In [1]:
import sys

sys.path.append("../")

import sqlite3
import typing as t
import pandas as pd

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole session; None means "neither warn nor raise".
pd.options.mode.chained_assignment = None


def clean_list_occupation(
    clean_dict: t.List[dict], list_targeted: t.List[str]
) -> t.List[str]:
    """Drop redundant occupations from one individual's occupation list.

    Every rule in ``clean_dict`` is a mapping with two keys:
    ``"occupation_pair"`` (an iterable of occupations) and
    ``"remove_occupation"`` (the occupation to drop). A rule fires when every
    member of its pair is present in the list at that point. Rules are applied
    in order, so an earlier removal can stop a later rule from firing.

    Args:
        clean_dict: Rules to apply, in order.
        list_targeted: Occupations of a single individual. It is not modified.

    Returns:
        A new list holding the occupations that survive all rules, in their
        original order.
    """
    # Work on a copy so the caller's list is never mutated behind its back.
    cleaned = list(list_targeted)

    for rule in clean_dict:
        pair = rule["occupation_pair"]
        to_remove = rule["remove_occupation"]

        # list.remove raises ValueError on a missing item, so a rule whose
        # target is not in the list is skipped instead of crashing.
        if to_remove in cleaned and all(elem in cleaned for elem in pair):
            cleaned.remove(to_remove)

    return cleaned


def check(list_targeted):
    """Return 1 if both "botanist" and "biologist" are in ``list_targeted``, else 0."""
    for required in ("botanist", "biologist"):
        if required not in list_targeted:
            return 0
    return 1


def check_if_contains(list, occupation: str):
    """Return 1 if ``occupation`` is a member of ``list``, otherwise 0.

    The first parameter shadows the builtin ``list``; the name is kept because
    callers may pass it by keyword.
    """
    return 1 if occupation in list else 0


def get_proportionned_occupations(
    data_final, data_merge, occupation_select="biologist"
):
    """Relabel rows of a generic occupation as zoologist, botanist or anatomist.

    The shares are measured on ``data_merge``: among the individuals that carry
    ``occupation_select`` there, the fraction that is also a "zoologist" and
    the fraction that is also a "botanist" are each rounded to one decimal.
    The rows of ``data_final`` labelled ``occupation_select`` are then
    relabelled by position:

    * the first ``int(share_zoologist * n)`` rows become "zoologist",
    * the next ``int(share_botanist * n)`` rows become "botanist",
    * every remaining row becomes "anatomist", whatever the measured anatomist
      share is.

    Which individual receives which label therefore depends only on row order.

    Args:
        data_final: Frame with a ``meta_occupation`` column. It is not modified.
        data_merge: Frame with ``wikidata_id`` and ``meta_occupation`` columns,
            used only to measure the shares.
        occupation_select: The generic occupation to redistribute.

    Returns:
        A new frame: the rows of ``data_final`` that do not carry
        ``occupation_select``, followed by the relabelled rows, with the
        original index labels kept. If no individual in ``data_merge`` carries
        ``occupation_select`` the shares are undefined, and an unmodified copy
        of ``data_final`` is returned.
    """
    list_inds = list(
        set(data_merge[data_merge["meta_occupation"] == occupation_select].wikidata_id)
    )
    if not list_inds:
        # Nothing to measure against; this used to be a ZeroDivisionError.
        return data_final.copy()

    data_select = data_merge[data_merge["wikidata_id"].isin(list_inds)]

    def get_share(occupation):
        """Share of the selected individuals also labelled ``occupation``."""
        n_with_occupation = len(
            set(data_select[data_select["meta_occupation"] == occupation].wikidata_id)
        )
        return round(n_with_occupation / len(list_inds), 1)

    share_zoologist = get_share("zoologist")
    share_botanist = get_share("botanist")

    data_final_without_bio = data_final[
        data_final["meta_occupation"] != occupation_select
    ]
    # Explicit copy: the rows are rewritten below and must not alias data_final.
    data_final_biologists = data_final[
        data_final["meta_occupation"] == occupation_select
    ].copy()

    n_rows = len(data_final_biologists)
    number_zoologists = int(share_zoologist * n_rows)
    number_botanists = int(share_botanist * n_rows)

    # Positional writes through .iloc. The previous chained form
    # (frame["col"][a:b] = value) is silently a no-op under pandas
    # Copy-on-Write, and the index may hold duplicates after explode().
    occupation_col = data_final_biologists.columns.get_loc("meta_occupation")
    zoologists_end = number_zoologists
    botanists_end = number_zoologists + number_botanists
    data_final_biologists.iloc[:zoologists_end, occupation_col] = "zoologist"
    data_final_biologists.iloc[zoologists_end:botanists_end, occupation_col] = "botanist"
    data_final_biologists.iloc[botanists_end:, occupation_col] = "anatomist"

    return pd.concat([data_final_without_bio, data_final_biologists])



In [2]:
from functions.env import DATA_PATH, DB_SCIENCE_PATH_NEW

# Load every (individual, occupation) row from the science database. `conn`
# stays open on purpose: a later cell writes the cleaned table through it.
conn = sqlite3.connect(DB_SCIENCE_PATH_NEW)
data = pd.read_sql("SELECT * FROM individuals_occupation_information", conn)

# Number of distinct individuals before any cleaning.
print(len(set(data.individual_wikidata_id)))

# Shorter column names used by the rest of the notebook.
data = data.rename(
    columns={
        "occupations_name": "occupation",
        "individual_wikidata_id": "wikidata_id",
    }
)

# Only the last expression of a cell is displayed, so an ad-hoc filter such as
# data[data["occupation"] == "philosopher"] belongs in a cell of its own.
data

71331


Unnamed: 0,wikidata_id,individual_name,occupations_wikidata_id,occupation,occupations_category,birthyear,productive_year
0,Q55844134,Ernst von Heeringen,Q16031530,music theorist,science|writer,1810.0,1845.0
1,Q2450610,Wojciech Jastrzębowski,Q864503,biologist,science|writer,1799.0,1834.0
2,Q2450610,Wojciech Jastrzębowski,Q2374149,botanist,science|writer,1799.0,1834.0
3,Q246595,Joseph Smit,Q1225716,ornithologist,science|writer,1836.0,1871.0
4,Q23407086,Henry Keyes Jordan,Q520549,geologist,science|writer,1838.0,1873.0
...,...,...,...,...,...,...,...
91180,Q3619606,Antonio Frizzi,Q201788,historian,writer,1736.0,1771.0
91181,Q3856471,Michelangelo Ziccardi,Q201788,historian,writer,1802.0,1837.0
91182,Q1539502,Gottfried Weigle,Q14467526,linguist,writer,1816.0,1851.0
91183,Q3105876,Gilbert Tennent,Q1234713,theologian,writer,1703.0,1738.0


In [3]:
# Annotation sheet that maps each raw occupation to a meta-occupation.
df_annotation = pd.read_excel(
    DATA_PATH + "/ENS - True Science.xlsx", sheet_name="cleaning_top_occupations"
)

# Keep rows with an empty `erase` cell and a count_occupation of at least 10,
# then reduce the sheet to the occupation -> meta_occupation mapping.
keep_row = df_annotation["erase"].isna() & (df_annotation["count_occupation"] >= 10)
df_annotation = df_annotation[keep_row][["occupation", "meta_occupation"]].reset_index(
    drop=True
)



In [4]:
# Co-occurrence rules: when both `source` and `target` are present for an
# individual, `remove_occupation` is the label to drop. Rows with any missing
# cell are ignored.
rules_sheet = pd.read_excel(
    DATA_PATH + "/ENS - True Science.xlsx", sheet_name="co_occurence_occupation"
)
rules_sheet = (
    rules_sheet[["source", "target", "remove_occupation"]]
    .dropna()
    .reset_index(drop=True)
)

# One record per rule, in sheet order, in the shape clean_list_occupation reads.
clean_dict = [
    {"occupation_pair": [source, target], "remove_occupation": removed}
    for source, target, removed in zip(
        rules_sheet["source"],
        rules_sheet["target"],
        rules_sheet["remove_occupation"],
    )
]


In [5]:
# Attach the meta-occupation to every row (default inner join on `occupation`),
# then drop the raw label and any rows that became duplicates.
data_merge = (
    data.merge(df_annotation, on="occupation")
    .drop(columns="occupation")
    .drop_duplicates()
)

# Number of distinct meta-occupations after the merge.
len(set(data_merge["meta_occupation"]))

33

In [6]:
# Meta-occupations listed in the annotation sheet that no merged row carries.
differences = set(df_annotation["meta_occupation"]).difference(
    data_merge["meta_occupation"]
)
differences

set()

In [7]:




# One row per individual, holding the list of all their meta-occupations.
data_group = (
    data_merge.groupby("wikidata_id")["meta_occupation"].apply(list).reset_index()
)

# De-duplicate each list, then apply the co-occurrence rules to it.
data_group["meta_occupation"] = data_group["meta_occupation"].apply(
    lambda occupations: clean_list_occupation(clean_dict, list(set(occupations)))
)

# Back to one row per (individual, meta-occupation).
data_final = data_group.explode("meta_occupation")

# Redistribute the generic labels one after the other; the second pass works
# on the output of the first.
df_final_new = data_final
for generic_occupation in ("biologist", "naturalist"):
    df_final_new = get_proportionned_occupations(
        df_final_new, data_merge, occupation_select=generic_occupation
    )
df_final_new = df_final_new.drop_duplicates().reset_index(drop=True)

# Fold a few small meta-occupations into a larger neighbour.
# NOTE(review): "miitary specialist" looks like a typo, but it presumably
# matches the spelling used in the annotation data - verify before correcting.
replace_occupation = {
    "demographer": "geographer",
    "miitary specialist": "historian",
    "criminologist": "sociologist",
}

df_final_new["meta_occupation"] = df_final_new["meta_occupation"].map(
    lambda occupation: replace_occupation.get(occupation, occupation)
)


In [8]:
# Persist the cleaned table, overwriting any previous version of it.
df_final_new.to_sql(
    name="cleaned_occupations_science",
    con=conn,
    if_exists="replace",
    index=False,
)

# Number of distinct meta-occupations that were written.
n_meta_occupations = len(set(df_final_new["meta_occupation"]))
print(n_meta_occupations)


28


In [9]:
# Distinct meta-occupations left after cleaning. The order comes from a set
# and is therefore arbitrary. (A preceding sort_values() call was dropped: its
# result was neither assigned nor displayed.)
list(set(df_final_new.meta_occupation))

['philosopher',
 'linguist',
 'musicologist',
 'astronomer',
 'mathematician',
 'art theorist',
 'chemist',
 'economist',
 'physicist',
 'botanist',
 'paleontologist',
 'political scientist',
 'geologist',
 'geographer',
 'epidemiologist',
 'zoologist',
 'geneticist',
 'anthropologist',
 'theologian',
 'historian',
 'ecologist',
 'meteorologist',
 'sociologist',
 'archeologist',
 'taxonomist',
 'anatomist',
 'pharmacologist',
 'logician']

historian              13611
theologian             12144
botanist                7668
philosopher             5937
mathematician           4733
zoologist               4723
geographer              2984
chemist                 2712
linguist                2579
astronomer              2530
archeologist            2139
physicist               1927
geologist               1918
anatomist               1734
economist               1620
musicologist            1152
anthropologist          1075
paleontologist           597
meteorologist            365
sociologist              199
pharmacologist           186
political scientist       87
logician                  37
art theorist              37
epidemiologist            31
taxonomist                21
ecologist                 17
geneticist                14
Name: meta_occupation, dtype: int64