In [1]:
import sys

sys.path.append("../")

import pandas as pd
from dotenv import load_dotenv

load_dotenv()
import os
import sqlite3

DB_PATH = os.getenv("DB_PATH")
DATA_PATH = "data"

conn = sqlite3.connect(DB_PATH)

if not os.path.exists(DATA_PATH):
    os.makedirs(DATA_PATH)

In [2]:
# Individuals Regions

df = pd.read_csv("../immaterial_index/results/df_individuals_score.csv")
print(len(set(df.individual_wikidata_id)))

df[df['individual_name']=='Hesiod']

159340


Unnamed: 0.1,Unnamed: 0,individual_wikidata_id,productive_year,decade,individual_name,score,region_name
297556,312663,Q44233,-715.0,-720,Hesiod,147,Eastern Europe
297557,312664,Q44233,-715.0,-720,Hesiod,147,Balkans
297558,312665,Q44233,-715.0,-720,Hesiod,147,mediterranean World
297559,312666,Q44233,-715.0,-720,Hesiod,147,Greek World


In [3]:


# Load works of individuals

df_ind_works = pd.read_sql_query("SELECT * FROM individual_created_work", conn)

df_count_work = (
    df_ind_works.groupby("individual_wikidata_id")["work_wikidata_id"]
    .count()
    .rename("count_works")
    .reset_index()
)
df_final = pd.merge(df, df_count_work, on="individual_wikidata_id", how="left")
df_final = df_final.fillna(0)  # When there is no works we add 0
df_final = df_final.drop('Unnamed: 0', axis=1)
df_final = df_final.drop('productive_year', axis=1)
df_final.to_csv(DATA_PATH + "/df_indi_works.csv")

In [4]:
df_gender = pd.read_sql_query("SELECT * FROM individual_gender", conn)
df_final_gender = pd.merge(df_gender[['individual_wikidata_id', 'gender']], df_final, on = 'individual_wikidata_id')
df_final_gender.to_csv(DATA_PATH + "/df_indi_works_gender.csv")

In [5]:

   # "source": [
   #    "# Individuals Regions\n",
   #    "\n",
   #    "df = pd.read_csv(\"../immaterial_index/results/df_individuals_score.csv\")\n",
   #    "print(len(set(df.individual_wikidata_id)))\n",
   #    "\n",
   #    "df[df['individual_name']=='Hesiod']"
   # ]

## Associate a category to CPs based on the type of their work

In [6]:
df_work_type = pd.read_sql_query("SELECT * FROM created_work", conn)

In [7]:
df_work_category = df_work_type[['work_wikidata_id', 'instance_label']].drop_duplicates()
df_work_covariate = pd.merge(df_ind_works, df_work_category, on = ['work_wikidata_id'])
df_work_covariate['instance_label'].value_counts().head(20)

instance_label
painting                            245949
biographical article                 95720
poem                                 66697
print                                59306
literary work                        50841
version, edition, or translation     40825
drawing                              29176
encyclopedia article                 28886
article                              24221
photograph                           20282
publication                          15320
watercolor painting                  11061
scholarly article                    10703
sculpture                             8181
engraving                             7618
poetry                                6371
work of art                           5657
written work                          5299
etching print                         5085
scenography sketch                    3305
Name: count, dtype: int64

In [8]:


# took the top 20 categories that occur the most and group them

# Define mapping of original categories to new categories
category_mapping = {
    'painting': 'painting',
    'biographical article': 'literary work',
    'poem': 'literary work',
    'print': 'print',
    'literary work': 'literary work',
    'version, edition, or translation': 'literary work',
    'drawing': 'drawing',
    'encyclopedia article': 'literary work',
    'article': 'literary work',
    'photograph': 'photograph',
    'publication': 'literary work',
    'watercolor painting': 'painting',
    'scholarly article': 'literary work',
    'sculpture': 'sculpture',
    'engraving': 'print',
    'poetry': 'literary work',
    'written work': 'literary work',
    'etching print': 'print',
}
df_work_covariate['category']  = df_work_covariate['instance_label'].apply(lambda x:category_mapping.get(x, None)) # drop None otherwise
df_work_covariate = df_work_covariate.dropna()
df_work_covariate = df_work_covariate[['individual_wikidata_id', 'work_wikidata_id', 'category']].drop_duplicates()



In [9]:
df_work_covariate.category.value_counts()

category
literary work    322170
painting         256858
print             62153
drawing           29176
photograph        20282
sculpture          8181
Name: count, dtype: int64

In [10]:
count_works = df_work_covariate.groupby(['individual_wikidata_id', 'category']).size().reset_index()

# Handle indivuals who have two of the same category
diverse_categories = count_works[count_works['individual_wikidata_id'].duplicated(keep=False)]
filtered_df = diverse_categories.groupby('individual_wikidata_id').filter(lambda x: x[0].nunique() == 1)
filtered_df = filtered_df.sample(frac=1, random_state=42).reset_index(drop=True)
filtered_df = filtered_df.drop_duplicates('individual_wikidata_id', keep = 'first')
filtered_df = filtered_df[['individual_wikidata_id', 'category']]

# now do the rest


# remove already handled individuals
other_categories = count_works[~count_works['individual_wikidata_id'].isin(filtered_df['individual_wikidata_id'])]
other_categories = other_categories.sort_values(['individual_wikidata_id', 0], ascending=(False, False))
other_categories = other_categories.groupby('individual_wikidata_id').head(1)
other_categories = other_categories[['individual_wikidata_id', 'category']]

final_categories = pd.concat([other_categories, filtered_df])
final_categories = final_categories.reset_index(drop=True)

df_final_category = pd.merge(df_final, final_categories, on = ['individual_wikidata_id'])
df_final_category.to_csv(DATA_PATH + "/df_indi_works_category.csv")

In [11]:
len(set(final_categories.individual_wikidata_id)) == 40732

True

In [12]:
final_categories.category.value_counts()

category
literary work    17801
painting         17412
print             2518
sculpture         1509
drawing           1020
photograph         472
Name: count, dtype: int64

## Extract Occupations

In [22]:
df_occupations = pd.read_sql_query("SELECT * FROM individual_occupations", conn)
df_occupations.groupby('individual_wikidata_id')['occupations_wikidata_id'].count().mean()

top_occupations= list(df_occupations.occupations_name.value_counts().head(50).index)

In [32]:
occupation_categories = {
    'painter': 'Work of art',
    'poet': 'Written work',
    'composer': 'Music',
    #'university teacher': 'Educator',
    'historian': 'Written work',
    'theologian': 'Written work',
    'architect': 'Architecture',
    'sculptor': 'Sculpture',
    'translator': 'Written work',
    'botanist': 'Scientist',
    #'artist': 'Visual Artist',
    'philosopher': 'Written work',
    'drawer': 'Work of art',
    #'photographer': 'Visual Artist',
    'mathematician': 'Scientist',
    #'pedagogue': 'Educator',
    'engraver': 'Sculpture',
    'playwright': 'Theater',
    #'explorer': 'Adventurer',
    'singer': 'Music',
    'musician': 'Music',
    'chemist': 'Scientist',
    'linguist': 'Written work',
    'organist': 'Music',
    'astronomer': 'Scientist',
    'entomologist': 'Scientist',
    'editor': 'Written work',
    'philologist': 'Science',
    'stage actor': 'Theater',
    'archaeologist': 'Scientist',
    'naturalist': 'Scientist',
    'printmaker': 'Work of art',
    'physicist': 'Scientist',
    'opera singer': 'Music',
    'novelist': 'Written work',
    'conductor': 'Music',
    'classical philologist': 'Scientist',
    #'professor': 'Educator',
    'cartographer': 'Scientist',
    'pianist': 'Music',
    'economist': 'Scientist',
    'copperplate engraver': 'Work of art',
    'illustrator': 'Work of art',
    'geologist': 'Scientist',
    'zoologist': 'Scientist',
    'scientific illustrator': 'Work of art',
    'scientist': 'Scientist',
    'geographer': 'Scientist',
    'art historian': 'Written work',
    'violinist': 'Music'
}

df_occupations['occupation'] = df_occupations['occupations_name'].apply(lambda x: occupation_categories.get(x, None))
df_occupations = df_occupations.dropna()
df_occupations.occupation.value_counts()

occupation
Written work    66595
Work of art     53588
Scientist       35488
Music           29267
Sculpture       12344
Architecture    11599
Theater          5736
Science          2216
Name: count, dtype: int64

In [36]:
df_final_occupation = pd.merge(df_final, df_occupations, on = 'individual_wikidata_id')
df_final_occupation = df_final_occupation[['individual_wikidata_id', 'decade', 'region_name', 'count_works', 'occupation']].drop_duplicates()
df_final_occupation.to_csv(DATA_PATH + "/df_indi_works_occupations.csv")

Unnamed: 0,individual_wikidata_id,decade,region_name,count_works,occupation
0,Q60835032,1820,Eastern Europe,0.0,Music
1,Q1100854,1860,Eastern Europe,0.0,Written work
2,Q3048804,1870,United Kingdom,0.0,Written work
3,Q1974313,1860,France,3.0,Scientist
4,Q12630095,1870,Italy,0.0,Written work
...,...,...,...,...,...
53291,Q11814103,1860,Eastern Europe,0.0,Architecture
53292,Q367749,1820,France,6.0,Scientist
53293,Q59850609,1820,United Kingdom,0.0,Sculpture
53294,Q6218728,1820,United Kingdom,0.0,Written work


In [16]:
df_clean_gdp = pd.read_sql_query("SELECT * FROM gdp_clean", conn)

region_code = pd.read_csv('../environnement_data/region_code_region_name.csv')
df_clean_gdp = pd.merge(df_clean_gdp, region_code, on  = 'region_code')


regions_clean = list(set(df_clean_gdp["region_name"]))

df = df[df["region_name"].isin(regions_clean)]
df = df[df["decade"] >= min(df_clean_gdp.year)]


df_final = pd.merge(df, df_count_work, on="individual_wikidata_id", how="left")
df_final = df_final.fillna(0)  # When there is no works we add 0
df_final.to_csv(DATA_PATH + "/df_indi_works_clean_gdp.csv")