In [1]:
# --- Imports (standard library, then third-party) ---------------------------
import os
import sqlite3
import sys

import pandas as pd
from dotenv import load_dotenv

# --- Configuration -----------------------------------------------------------
# Make modules located in the parent directory importable from this notebook.
sys.path.append("../")

# Populate os.environ from a .env file so that DB_PATH can be read below.
load_dotenv()

DB_PATH = os.getenv("DB_PATH")  # path of the SQLite database queried below
DATA_PATH = "data"  # directory receiving every CSV written by this notebook

# Fail early with an actionable message: sqlite3.connect(None) raises an
# opaque TypeError, and an empty path silently opens a temporary database.
if not DB_PATH:
    raise RuntimeError(
        "DB_PATH is not set; define it in the environment or in a .env file"
    )

conn = sqlite3.connect(DB_PATH)

# exist_ok=True makes this cell safe to re-run.
os.makedirs(DATA_PATH, exist_ok=True)

In [2]:
# Individuals and their regions: load the per-individual score table.
individuals_csv = "../immaterial_index/results/df_individuals_score.csv"
df = pd.read_csv(individuals_csv)

# Number of distinct individuals in the table.
unique_individual_ids = set(df["individual_wikidata_id"])
print(len(unique_individual_ids))

# Spot-check one well-known individual; the same person can appear on several
# rows, one per region_name.
is_hesiod = df["individual_name"] == "Hesiod"
df[is_hesiod]

159340


Unnamed: 0.1,Unnamed: 0,individual_wikidata_id,productive_year,decade,individual_name,score,region_name
297556,312663,Q44233,-715.0,-720,Hesiod,147,Eastern Europe
297557,312664,Q44233,-715.0,-720,Hesiod,147,Balkans
297558,312665,Q44233,-715.0,-720,Hesiod,147,mediterranean World
297559,312666,Q44233,-715.0,-720,Hesiod,147,Greek World


In [3]:


# Load works of individuals

df_ind_works = pd.read_sql_query("SELECT * FROM individual_created_work", conn)

# Number of works per individual (count() only counts non-null work ids).
df_count_work = (
    df_ind_works.groupby("individual_wikidata_id")["work_wikidata_id"]
    .count()
    .rename("count_works")
    .reset_index()
)

# Left join: individuals without any recorded work are kept and get NaN in
# count_works.
df_final = pd.merge(df, df_count_work, on="individual_wikidata_id", how="left")

# When there is no works we add 0. The fill is restricted to count_works:
# a frame-wide fillna(0) would also replace missing names, regions or years
# with 0.
df_final = df_final.fillna({"count_works": 0})

df_final = df_final.drop(columns=["Unnamed: 0", "productive_year"])
df_final.to_csv(DATA_PATH + "/df_indi_works.csv")


In [4]:
# Attach the gender of each individual to the works table. The join is an
# inner join (pandas default), so individuals without a gender row are dropped.
gender_columns = ['individual_wikidata_id', 'gender']
df_gender = pd.read_sql_query("SELECT * FROM individual_gender", conn)

df_final_gender = df_gender[gender_columns].merge(df_final, on='individual_wikidata_id')
df_final_gender.to_csv(DATA_PATH + "/df_indi_works_gender.csv")

## Associate a category to CPs based on the type of their work

In [5]:
# Load the full created_work table; its instance_label column is what the
# category mapping further down is keyed on.
df_work_type = pd.read_sql_query(sql="SELECT * FROM created_work", con=conn)

instance_label
painting                            249395
poem                                133174
biographical article                 95714
version, edition, or translation     82882
print                                57663
                                     ...  
grand staircase                          1
scenic viewpoint                         1
wine column                              1
mythological Greek character             1
dastan                                   1
Name: count, Length: 2673, dtype: int64

In [23]:
# Distinct (work, instance_label) pairs, joined onto the individual -> work links.
df_work_category = df_work_type[['work_wikidata_id', 'instance_label']].drop_duplicates()
df_work_covariate = pd.merge(df_ind_works, df_work_category, on=['work_wikidata_id'])

# Frequencies of the most common labels, kept in a variable so they can be
# inspected; the mapping below was written from the labels that occur the most.
top_instance_labels = df_work_covariate['instance_label'].value_counts().head(20)

# Define mapping of original categories to new categories
category_mapping = {
    'painting': 'painting',
    'biographical article': 'literary work',
    'poem': 'literary work',
    'print': 'print',
    'literary work': 'literary work',
    'version, edition, or translation': 'literary work',
    'drawing': 'drawing',
    'encyclopedia article': 'literary work',
    'article': 'literary work',
    'photograph': 'photograph',
    'publication': 'literary work',
    'watercolor painting': 'painting',
    'scholarly article': 'literary work',
    'sculpture': 'sculpture',
    'engraving': 'engraving',
    'poetry': 'literary work',
    'written work': 'literary work',
    'etching print': 'print',
}

# Labels absent from the mapping become NaN and are dropped just below.
df_work_covariate['category'] = df_work_covariate['instance_label'].map(category_mapping)

# Drop rows only when one of the columns we keep is missing. An unscoped
# dropna() would also discard rows because of a null in any other column
# coming from the SELECT * on individual_created_work.
kept_columns = ['individual_wikidata_id', 'work_wikidata_id', 'category']
df_work_covariate = (
    df_work_covariate
    .dropna(subset=kept_columns)
    .loc[:, kept_columns]
    .drop_duplicates()
)



Unnamed: 0,individual_wikidata_id,work_wikidata_id,category
0,Q13129165,Q76903996,literary work
1,Q13129165,Q76904746,literary work
2,Q13129165,Q76905321,literary work
3,Q13129165,Q76905845,literary work
4,Q13129165,Q76905846,literary work
...,...,...,...
852177,Q307365,Q19505126,literary work
852178,Q307365,Q19505637,literary work
852179,Q307365,Q19506597,literary work
852180,Q307365,Q20385416,literary work


In [90]:
# Number of works per (individual, category). reset_index() leaves the count in
# a column literally named 0, which the code below refers to.
count_works = df_work_covariate.groupby(['individual_wikidata_id', 'category']).size().reset_index()

# Case 1: individuals with several categories that ALL have the same number of
# works (a full tie). Shuffle reproducibly and keep one category at random.
diverse_categories = count_works[count_works['individual_wikidata_id'].duplicated(keep=False)]
filtered_df = diverse_categories.groupby('individual_wikidata_id').filter(lambda x: x[0].nunique() == 1)
filtered_df = filtered_df.sample(frac=1, random_state=42).reset_index(drop=True)
filtered_df = filtered_df.drop_duplicates('individual_wikidata_id', keep='first')
filtered_df = filtered_df[['individual_wikidata_id', 'category']]

# Case 2: everyone else (not handled above). Keep the category with the
# largest number of works. If only some categories tie for the maximum, the
# first row after sorting wins.
other_categories = count_works[~count_works['individual_wikidata_id'].isin(filtered_df['individual_wikidata_id'])]
other_categories = other_categories.sort_values(['individual_wikidata_id', 0], ascending=False)
other_categories = other_categories.groupby('individual_wikidata_id').head(1)
other_categories = other_categories[['individual_wikidata_id', 'category']]

final_categories = pd.concat([other_categories, filtered_df])
final_categories = final_categories.reset_index(drop=True)

# Sanity checks that actually fail when violated: exactly one category per
# individual, and no individual lost. (A bare `... == 40732` comparison was
# evaluated here and its result discarded; 40732 was the count observed then.)
assert final_categories['individual_wikidata_id'].is_unique, \
    "an individual was assigned more than one category"
assert (
    final_categories['individual_wikidata_id'].nunique()
    == count_works['individual_wikidata_id'].nunique()
), "some individuals lost their category"

# NOTE(review): df_final is reassigned by the GDP cell further down. The saved
# output of this cell still shows the 'Unnamed: 0' and 'productive_year'
# columns that the works cell drops, which suggests it was last run after the
# GDP cell. Confirm which df_final is intended here.
df_final_category = pd.merge(df_final, final_categories, on=['individual_wikidata_id'])
df_final_category.to_csv(DATA_PATH + "/df_indi_works_category.csv")

Unnamed: 0.1,Unnamed: 0,individual_wikidata_id,productive_year,decade,individual_name,score,region_name,count_works,category
0,25,Q1871519,1834.0,1830,Louis Barré,34,France,6.0,literary work
1,37,Q1974313,1858.0,1860,Victor Lemoine,13,France,3.0,literary work
2,84,Q16198759,1877.0,1880,Gustavo Mancinelli,19,Italy,2.0,painting
3,128,Q18923454,1854.0,1850,Alexandre Ségé,17,France,7.0,painting
4,178,Q3086433,1864.0,1860,Zénaïde Fleuriot,32,France,17.0,literary work
...,...,...,...,...,...,...,...,...,...
12169,568340,Q125409,1848.0,1850,Charles Jacque,76,France,170.0,painting
12170,568394,Q5924636,1829.0,1830,Jacques Duret,10,France,2.0,literary work
12171,568501,Q16322467,1859.0,1860,Louis Lemaire,15,France,1.0,painting
12172,568521,Q367749,1817.0,1820,Louis-René Villermé,43,France,6.0,literary work


In [11]:
# GDP table, enriched with region names through the region_code lookup file.
df_clean_gdp = pd.read_sql_query("SELECT * FROM gdp_clean", conn)

region_code = pd.read_csv('../environnement_data/region_code_region_name.csv')
df_clean_gdp = pd.merge(df_clean_gdp, region_code, on='region_code')

# Regions for which GDP data exists.
regions_clean = list(set(df_clean_gdp["region_name"]))

# Series.min() skips NaN; the builtin min() over a Series gives an
# order-dependent result (possibly NaN) when a year is missing.
first_gdp_year = df_clean_gdp["year"].min()

# NOTE(review): this overwrites df and df_final, so cells above that read them
# give different results depending on execution order. The names are kept
# because later cells may rely on them; confirm before renaming.
df = df[df["region_name"].isin(regions_clean)]
df = df[df["decade"] >= first_gdp_year]

df_final = pd.merge(df, df_count_work, on="individual_wikidata_id", how="left")
# When there is no works we add 0. Only count_works is filled, so missing
# values in other columns are not turned into 0.
df_final = df_final.fillna({"count_works": 0})
df_final.to_csv(DATA_PATH + "/df_indi_works_clean_gdp.csv")