In [1]:
import sys

sys.path.append("../")

import pandas as pd
from dotenv import load_dotenv

load_dotenv()
import os
import sqlite3

# Location of the SQLite database, read from the environment
# (load_dotenv() has already been called, so a .env entry is honoured).
DB_PATH = os.getenv("DB_PATH")
# Directory, relative to this notebook, that receives the exported CSV files.
DATA_PATH = "data"

# Fail early with an actionable message: sqlite3.connect(None) would raise an
# opaque TypeError, and an empty string would open a throw-away temporary database.
if not DB_PATH:
    raise RuntimeError(
        "DB_PATH is not set; define it in the environment or in the .env file"
    )

# Shared connection used by every read_sql_query call in the notebook.
conn = sqlite3.connect(DB_PATH)

# exist_ok=True makes the directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(DATA_PATH, exist_ok=True)

In [20]:
# Individuals Regions

# Score table for individuals; the first CSV column becomes the index.
df = pd.read_csv(
    "../immaterial_index/results/df_individuals_score.csv",
    index_col=[0],
)

# Number of distinct individuals present in the table.
print(len(set(df["individual_wikidata_id"])))

# Spot check: every row recorded for Hesiod.
df.loc[df["individual_name"] == "Hesiod"]

149547


Unnamed: 0,individual_wikidata_id,productive_year,decade,individual_name,score,region_name
190851,Q44233,-765.0,-760,Hesiod,147,Eastern Europe
190852,Q44233,-765.0,-760,Hesiod,147,Balkans
190853,Q44233,-765.0,-760,Hesiod,147,Greek World
190854,Q44233,-765.0,-760,Hesiod,147,mediterranean World


In [21]:
# Display the score table; as the cell's only expression it is rendered as the cell output.
df

Unnamed: 0,individual_wikidata_id,productive_year,decade,individual_name,score,region_name
8,Q496891,1570.0,1570,Seong Hon,13,Korea
9,Q5163563,1802.0,1800,Constance Marie Charpentier,22,Northwestern Europe
10,Q5163563,1802.0,1800,Constance Marie Charpentier,22,Western Europe
11,Q5163563,1802.0,1800,Constance Marie Charpentier,22,Northern France
12,Q5163563,1802.0,1800,Constance Marie Charpentier,22,France
...,...,...,...,...,...,...
612996,Q78064636,1774.0,1770,Samuel Rodolphe Jeanneret,5,Switzerland
612997,Q3111520,1736.0,1740,Gottfried Boy,16,German world
612998,Q3111520,1736.0,1740,Gottfried Boy,16,Germany
612999,Q3111520,1736.0,1740,Gottfried Boy,16,Northwestern Europe


In [23]:
# Load works of individuals and attach a per-individual work count to the score table.

# Individual -> work links read from the database.
df_ind_works = pd.read_sql_query("SELECT * FROM individual_created_work", conn)

# Number of non-null work ids recorded for each individual.
df_count_work = (
    df_ind_works.groupby("individual_wikidata_id")["work_wikidata_id"]
    .count()
    .rename("count_works")
    .reset_index()
)

# Left merge keeps every row of df; individuals without any work get NaN in count_works.
df_final = pd.merge(df, df_count_work, on="individual_wikidata_id", how="left")
# When there are no works the count is 0. Only count_works is filled: a frame-wide
# fillna(0) would also turn missing values in the other columns into 0.
df_final = df_final.fillna({"count_works": 0})
df_final = df_final.drop('productive_year', axis=1)
df_final.to_csv(DATA_PATH + "/df_indi_works.csv")

## Add gender

In [24]:
# Gender records read from the database.
df_gender = pd.read_sql_query("SELECT * FROM individual_gender", conn)

# Inner join: only individuals that have a gender record are kept; the two
# gender-table columns come first in the result.
df_final_gender = df_gender[['individual_wikidata_id', 'gender']].merge(
    df_final,
    on='individual_wikidata_id',
)
df_final_gender.to_csv(DATA_PATH + "/df_indi_works_gender.csv")

## Associate a category to CPs based on the type of their work

In [8]:
# df_work_type = pd.read_sql_query("SELECT * FROM created_work", conn)
# df_work_category = df_work_type[['work_wikidata_id', 'instance_label']].drop_duplicates()
# df_work_covariate = pd.merge(df_ind_works, df_work_category, on = ['work_wikidata_id'])
# df_work_covariate['instance_label'].value_counts().head(20)
# df_work_covariate.category.value_counts()

# # Take the 20 most frequent work types and group them into broader categories

# # Define mapping of original categories to new categories
# category_mapping = {
#     'painting': 'painting',
#     'biographical article': 'literary work',
#     'poem': 'literary work',
#     'print': 'print',
#     'literary work': 'literary work',
#     'version, edition, or translation': 'literary work',
#     'drawing': 'drawing',
#     'encyclopedia article': 'literary work',
#     'article': 'literary work',
#     'photograph': 'photograph',
#     'publication': 'literary work',
#     'watercolor painting': 'painting',
#     'scholarly article': 'literary work',
#     'sculpture': 'sculpture',
#     'engraving': 'print',
#     'poetry': 'literary work',
#     'written work': 'literary work',
#     'etching print': 'print',
# }
# df_work_covariate['category']  = df_work_covariate['instance_label'].apply(lambda x:category_mapping.get(x, None)) # drop None otherwise
# df_work_covariate = df_work_covariate.dropna()
# df_work_covariate = df_work_covariate[['individual_wikidata_id', 'work_wikidata_id', 'category']].drop_duplicates()




# count_works = df_work_covariate.groupby(['individual_wikidata_id', 'category']).size().reset_index()

# # Handle individuals whose categories are tied (same number of works in each): pick one category at random
# diverse_categories = count_works[count_works['individual_wikidata_id'].duplicated(keep=False)]
# filtered_df = diverse_categories.groupby('individual_wikidata_id').filter(lambda x: x[0].nunique() == 1)
# filtered_df = filtered_df.sample(frac=1, random_state=42).reset_index(drop=True)
# filtered_df = filtered_df.drop_duplicates('individual_wikidata_id', keep = 'first')
# filtered_df = filtered_df[['individual_wikidata_id', 'category']]

# # now do the rest
# # remove already handled individuals
# other_categories = count_works[~count_works['individual_wikidata_id'].isin(filtered_df['individual_wikidata_id'])]
# other_categories = other_categories.sort_values(['individual_wikidata_id', 0], ascending=(False, False))
# other_categories = other_categories.groupby('individual_wikidata_id').head(1)
# other_categories = other_categories[['individual_wikidata_id', 'category']]

# final_categories = pd.concat([other_categories, filtered_df])
# final_categories = final_categories.reset_index(drop=True)

# df_final_category = pd.merge(df_final, final_categories, on = ['individual_wikidata_id'])
# df_final_category.to_csv(DATA_PATH + "/df_indi_works_category.csv")

# len(set(final_categories.individual_wikidata_id)) == 40732

# final_categories.category.value_counts()

## Extract Occupations

In [25]:
# Occupation records read from the database.
df_occupations = pd.read_sql_query("SELECT * FROM individual_occupations", conn)

# The 50 most frequent occupation names, most frequent first.
top_occupations = list(df_occupations.occupations_name.value_counts().head(50).index)

# Average number of occupation entries per individual. A bare expression is only
# displayed when it is the last statement of the cell, so it is placed last.
df_occupations.groupby('individual_wikidata_id')['occupations_wikidata_id'].count().mean()

In [27]:
# Lookup table: occupation name -> broad category.
# It is written grouped by category and inverted into a flat dict, so that the
# members of each category can be reviewed side by side.
#
# Occupations deliberately left unmapped (kept here for reference, with the
# category they were once assigned):
#   'university teacher', 'pedagogue', 'professor' -> 'Educator'
#   'artist', 'photographer'                       -> 'Visual Artist'
#   'explorer'                                     -> 'Adventurer'
occupation_categories = {
    occupation: category
    for category, occupations in {
        'Work of art': (
            'painter',
            'drawer',
            'printmaker',
            'copperplate engraver',
            'illustrator',
            'scientific illustrator',
        ),
        'Written work': (
            'poet',
            'historian',
            'theologian',
            'translator',
            'philosopher',
            'linguist',
            'editor',
            'novelist',
            'art historian',
        ),
        'Music': (
            'composer',
            'singer',
            'musician',
            'organist',
            'opera singer',
            'conductor',
            'pianist',
            'violinist',
        ),
        'Architecture': (
            'architect',
        ),
        # NOTE(review): 'engraver' is filed under Sculpture while
        # 'copperplate engraver' is under Work of art -- confirm this is intended.
        'Sculpture': (
            'sculptor',
            'engraver',
        ),
        'Theater': (
            'playwright',
            'stage actor',
        ),
        # NOTE(review): 'philologist' is under Science while 'linguist' is under
        # Written work -- confirm this is intended.
        'Science': (
            'botanist',
            'mathematician',
            'chemist',
            'astronomer',
            'entomologist',
            'philologist',
            'archaeologist',
            'naturalist',
            'physicist',
            'classical philologist',
            'cartographer',
            'economist',
            'geologist',
            'zoologist',
            'scientist',
            'geographer',
        ),
    }.items()
    for occupation in occupations
}

# Translate each occupation name into its broad category; names that are not
# keys of occupation_categories become NaN.
df_occupations['occupation'] = df_occupations['occupations_name'].map(occupation_categories)
# Drop only the rows without a mapped category. A bare dropna() would also
# discard rows that merely have a missing value in some unrelated column.
df_occupations = df_occupations.dropna(subset=['occupation'])
# Number of occupation records per category.
df_occupations.occupation.value_counts()

Written work    66595
Work of art     53588
Science         37704
Music           29267
Sculpture       12344
Architecture    11599
Theater          5736
Name: occupation, dtype: int64

In [40]:
# Keep a single category per individual: de-duplicate the (individual, category)
# pairs, shuffle them with a fixed seed, then keep the first row met for each
# individual.
df_unique_occupations = (
    df_occupations[['individual_wikidata_id', 'occupation']]
    .drop_duplicates()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .drop_duplicates('individual_wikidata_id', keep='first')
)

# Inner join with the works-count table, reduced to the exported columns and
# de-duplicated.
df_final_occupation = (
    df_final.merge(df_unique_occupations, on='individual_wikidata_id')
    .loc[:, ['individual_wikidata_id', 'decade', 'region_name', 'count_works', 'occupation']]
    .drop_duplicates()
)

df_final_occupation.to_csv(DATA_PATH + "/df_indi_works_occupations.csv")

In [29]:
# Category counts among the rows with at most two recorded works.
df_final_occupation.loc[
    df_final_occupation['count_works'] <= 2, 'occupation'
].value_counts()

Written work    122709
Work of art      85659
Science          65422
Music            49906
Sculpture        25872
Architecture     25696
Theater          13154
Name: occupation, dtype: int64

In [15]:
# Restrict the individuals to the regions and decades covered by the cleaned GDP
# data, then export them with their work counts.
# NOTE: this cell rebinds `df` and `df_final`; anything executed afterwards sees
# the GDP-restricted versions.
df_clean_gdp = pd.read_sql_query("SELECT * FROM gdp_clean", conn)

# Attach the columns of the region-code lookup file (used below for region_name).
region_code = pd.read_csv('../environnement_data/region_code_region_name.csv')
df_clean_gdp = pd.merge(df_clean_gdp, region_code, on='region_code')

# Distinct region names present in the GDP data.
regions_clean = list(set(df_clean_gdp["region_name"]))

# Keep rows whose region has GDP data and whose decade is not earlier than the
# first GDP year.
df = df[df["region_name"].isin(regions_clean)]
df = df[df["decade"] >= min(df_clean_gdp.year)]

# Left merge keeps every remaining row; individuals without works get NaN in count_works.
df_final = pd.merge(df, df_count_work, on="individual_wikidata_id", how="left")
# When there are no works the count is 0. Only count_works is filled: productive_year
# is still present here, and a frame-wide fillna(0) would export a missing year as year 0.
df_final = df_final.fillna({"count_works": 0})
df_final.to_csv(DATA_PATH + "/df_indi_works_clean_gdp.csv")