In [73]:
import sys

sys.path.append("../")

import sqlite3
import typing as t

import pandas as pd

# Disable pandas' chained-assignment check for the whole notebook, so
# assignments written as `df[col][rows] = value` no longer warn or raise.
pd.options.mode.chained_assignment = None

def clean_list_occupation(
    clean_dict: t.List[dict], list_targeted: t.List[str]
) -> t.List[str]:
    """Apply co-occurrence rules to one list of occupations.

    Each rule is a mapping with two keys:

    * ``"occupation_pair"``: occupations that must all be present for the
      rule to fire;
    * ``"remove_occupation"``: the occupation dropped when the rule fires.

    Rules are applied in order, so a removal made by an earlier rule can
    prevent a later rule from firing.

    Args:
        clean_dict: Rules to apply (a list of dicts, despite the name).
        list_targeted: Occupations to clean. The list is not modified.

    Returns:
        A new list holding the occupations that survive every rule.
    """
    # Work on a copy so the caller's list is never mutated behind its back.
    remaining = list(list_targeted)

    for rule in clean_dict:
        occupation_pair = rule["occupation_pair"]
        remove_occupation = rule["remove_occupation"]

        # `list.remove` raises ValueError on a missing value, so a rule only
        # fires when the occupation to drop is actually present.
        if remove_occupation in remaining and all(
            occupation in remaining for occupation in occupation_pair
        ):
            remaining.remove(remove_occupation)

    return remaining


def check(list_targeted):
    """Return 1 when both "botanist" and "biologist" are in ``list_targeted``, else 0."""
    for required in ("botanist", "biologist"):
        if required not in list_targeted:
            return 0
    return 1


def check_if_contains(list, occupation: str):
    """Return 1 when ``occupation`` is a member of ``list``, otherwise 0."""
    # The parameter name shadows the builtin; it is kept for keyword callers.
    return 1 if occupation in list else 0


def get_proportionned_occupations(
    data_final, data_merge, occupation_select="biologist"
):
    """Relabel ``occupation_select`` rows as zoologist, botanist or anatomist.

    The shares are estimated from ``data_merge``: among the individuals
    (``wikidata_id``) that have a row tagged ``occupation_select``, the
    fraction also tagged "zoologist" and the fraction also tagged "botanist"
    are computed, each rounded to one decimal. The rows of ``data_final``
    tagged ``occupation_select`` are then relabelled by position: the first
    share becomes "zoologist", the next share "botanist", and every remaining
    row "anatomist".

    Args:
        data_final: Frame with a "meta_occupation" column. It is not modified.
        data_merge: Reference frame with "wikidata_id" and "meta_occupation"
            columns, used only to estimate the shares.
        occupation_select: Occupation label to redistribute.

    Returns:
        A new frame: the rows of ``data_final`` not tagged
        ``occupation_select`` followed by the relabelled rows. When
        ``data_merge`` holds no individual tagged ``occupation_select`` the
        shares cannot be estimated and the rows are returned unrelabelled.
    """
    reference_mask = data_merge["meta_occupation"] == occupation_select
    list_inds = list(set(data_merge.loc[reference_mask, "wikidata_id"]))
    data_select = data_merge[data_merge["wikidata_id"].isin(list_inds)]

    def get_len_occupation(x="zoologist"):
        """Share of reference individuals also tagged ``x``, rounded to 0.1."""
        tagged = set(
            data_select.loc[data_select["meta_occupation"] == x, "wikidata_id"]
        )
        return round(len(tagged) / len(list_inds), 1)

    data_final_without_bio = data_final[
        data_final["meta_occupation"] != occupation_select
    ]
    # Explicit copy: the relabelling below must write into a frame we own,
    # never into a view or a temporary of ``data_final``.
    data_final_biologists = data_final[
        data_final["meta_occupation"] == occupation_select
    ].copy()

    if not list_inds:
        # No reference individual: avoid dividing by zero and leave the
        # labels as they are.
        return pd.concat([data_final_without_bio, data_final_biologists])

    total = len(data_final_biologists)
    number_zoologists = int(get_len_occupation(x="zoologist") * total)
    number_botanists = int(get_len_occupation(x="botanist") * total)

    # Positional writes through a single ``iloc`` call; chained assignment
    # (``df[col][rows] = ...``) does not update the frame under Copy-on-Write.
    column = data_final_biologists.columns.get_loc("meta_occupation")
    botanist_end = number_zoologists + number_botanists
    data_final_biologists.iloc[:number_zoologists, column] = "zoologist"
    data_final_biologists.iloc[number_zoologists:botanist_end, column] = "botanist"
    # Anatomists take whatever is left, so their own share is never needed.
    data_final_biologists.iloc[botanist_end:, column] = "anatomist"

    df_final_new = pd.concat([data_final_without_bio, data_final_biologists])

    return df_final_new

In [74]:
from functions.env import DATA_PATH, DB_SCIENCE_PATH

In [75]:
# Individual/occupation table; the first CSV column becomes the index.
data = pd.read_csv(
    filepath_or_buffer=DATA_PATH + "/df_indi_occupations.csv",
    index_col=[0],
)

In [76]:
# Occupation -> meta-occupation annotations: keep the rows whose "erase" cell
# is empty and whose "count_occupation" is at least 10, then keep only the
# two mapping columns.
df_annotation = (
    pd.read_excel(
        DATA_PATH + "/ENS - True Science.xlsx",
        sheet_name="cleaning_top_occupations",
    )
    .loc[lambda sheet: sheet["erase"].isna()]
    .loc[lambda sheet: sheet["count_occupation"] >= 10]
    .loc[:, ["occupation", "meta_occupation"]]
    .reset_index(drop=True)
)

In [77]:
# Co-occurrence rules: one record per fully filled (source, target,
# remove_occupation) row, stored as
# {"occupation_pair": [source, target], "remove_occupation": ...}.
clean_dict = (
    pd.read_excel(
        DATA_PATH + "/ENS - True Science.xlsx",
        sheet_name="co_occurence_occupation",
    )
    .loc[:, ["source", "target", "remove_occupation"]]
    .dropna()
    .reset_index(drop=True)
    .assign(
        occupation_pair=lambda rules: rules.apply(
            lambda row: [row["source"], row["target"]], axis=1
        )
    )
    .loc[:, ["occupation_pair", "remove_occupation"]]
    .to_dict(orient="records")
)

# Attach the meta-occupation to every individual, then drop the raw
# occupation label and the duplicate rows this creates.
data_merge = (
    data.merge(df_annotation, on="occupation")
    .drop(columns="occupation")
    .drop_duplicates()
)

# One row per individual holding the list of its meta-occupations.
data_group = (
    data_merge.groupby("wikidata_id")["meta_occupation"].apply(list).reset_index()
)

# De-duplicate each list, then apply the co-occurrence rules to it.
data_group["meta_occupation"] = data_group["meta_occupation"].apply(
    lambda occupations: clean_list_occupation(clean_dict, list(set(occupations)))
)

# Back to one row per (individual, meta-occupation).
data_final = data_group.explode("meta_occupation")

In [78]:
# Individuals with at least one "biologist" row, and all of their rows.
is_biologist = data_merge["meta_occupation"] == "biologist"
list_inds = list(set(data_merge.loc[is_biologist, "wikidata_id"]))
data_select = data_merge.loc[data_merge["wikidata_id"].isin(list_inds)]

In [79]:
# Row count per meta-occupation among the selected individuals.
data_select["meta_occupation"].value_counts()

biologist              433
zoologist              203
botanist               154
naturalist              85
anatomist               72
geologist               28
philosopher             28
chemist                 26
anthropologist          26
physicist               22
paleontologist          21
historian               19
archeologist            13
mathematician           13
geographer              12
astronomer              11
linguist                 8
economist                5
theologian               5
geneticist               4
meteorologist            4
ecologist                2
musicologist             2
political scientist      1
sociologist              1
taxonomist               1
pharmacologist           1
art theorist             1
epidemiologist           1
Name: meta_occupation, dtype: int64

In [80]:
# Ratio of "zoologist" rows to "biologist" rows in `data_select`, computed
# from the live counts instead of numbers copied from a displayed output, so
# it stays correct when the data changes. Despite the name, this is a
# fraction, not a percentage.
occupation_counts = data_select["meta_occupation"].value_counts()
percent_zoologist = float(
    occupation_counts.get("zoologist", 0) / occupation_counts["biologist"]
)
percent_zoologist

0.46882217090069284