In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
import sqlite3

# Pull configuration (notably DB_PATH) from a local .env file into the environment
load_dotenv()

# Directory holding the CSV extracts read and written by this notebook
DATA_PATH = "db_extract"
DB_PATH = os.getenv("DB_PATH")

# Fail early with an actionable message: sqlite3.connect(None) raises an
# unhelpful error, and sqlite3.connect("") silently opens a temporary empty
# database, which would only surface later as a "no such table" failure.
if not DB_PATH:
    raise RuntimeError(
        "DB_PATH is not set; define it in the environment or in a .env file"
    )

# Connection shared by every SQL query in the notebook
conn = sqlite3.connect(DB_PATH)

# Make sure the extract directory exists (no-op when it is already there)
os.makedirs(DATA_PATH, exist_ok=True)

# Per-individual table; the first CSV column is the saved index
df = pd.read_csv(f"{DATA_PATH}/df_individuals_score.csv", index_col=[0])
print(f"Unique individuals: {df['individual_wikidata_id'].nunique()}")

Unique individuals: 172951


### Load individuals with works

In [2]:
# Count the works credited to each individual and attach that count to the
# per-individual frame `df`.

# Rows of the individual_created_work table
df_ind_works = pd.read_sql_query("SELECT * FROM individual_created_work", conn)

# Number of non-null work ids per individual
df_count_work = (
    df_ind_works.groupby("individual_wikidata_id")["work_wikidata_id"]
    .count()
    .rename("count_works")
    .reset_index()
)

# The left merge keeps every row of `df`; individuals without a recorded work
# come out with NaN in count_works. Only that column is zero-filled: a blanket
# fillna(0) would also overwrite genuine missing values in every other column
# with 0.
df_final = (
    pd.merge(df, df_count_work, on="individual_wikidata_id", how="left")
    .fillna({"count_works": 0})
    .drop(columns="productive_year")
)

# Persist the merged table
df_final.to_csv(DATA_PATH + "/df_indi_works.csv")

Unnamed: 0,individual_wikidata_id,individual_name,work_wikidata_id,work_name
0,Q13129165,Hugh Hughes,Q76903996,"Adolygiad ar draethawd Mr. Eliseus Cole, ar be..."
1,Q13129165,Hugh Hughes,Q76904746,Crynodeb gramadeg Cymraeg : yn nghyda chyfarwy...
2,Q13129165,Hugh Hughes,Q76905321,"Gramadeg Cymraeg : sef, ieithiadur athronyddol..."
3,Q13129165,Hugh Hughes,Q76905845,Llyfr ar resymeg
4,Q13129165,Hugh Hughes,Q76905846,Llyfr ar resymeg
...,...,...,...,...
778821,Q307365,Abdul Qadir Gilani,Q19505126,Q19505126
778822,Q307365,Abdul Qadir Gilani,Q19505637,Q19505637
778823,Q307365,Abdul Qadir Gilani,Q19506597,Q19506597
778824,Q307365,Abdul Qadir Gilani,Q20385416,Q20385416


In [3]:
# Sanity check: number of distinct individuals in the merged frame
distinct_ids = set(df_final["individual_wikidata_id"])
print(len(distinct_ids))

172951


In [4]:
# Distinct individuals credited with zero or one work.
# NOTE(review): this value is not displayed or used in the visible cells.
at_most_one_work = df_final["count_works"].isin([0, 1])
filtered_individuals = len(set(df_final.loc[at_most_one_work, "individual_wikidata_id"]))

# Attach each individual's gender (inner join: individuals absent from the
# gender table are dropped) and save the result
df_gender = pd.read_sql_query("SELECT * FROM individual_gender", conn)
gender_columns = df_gender[["individual_wikidata_id", "gender"]]
df_final_gender = gender_columns.merge(df_final, on="individual_wikidata_id")
df_final_gender.to_csv(f"{DATA_PATH}/df_indi_works_gender.csv")


In [5]:
# Read the raw occupation rows and show how many distinct occupation names exist
df_occupations = pd.read_sql_query("SELECT * FROM individual_occupations", conn)
distinct_occupation_names = set(df_occupations["occupations_name"])
len(distinct_occupation_names)

1102

In [6]:
# Write the merged works table to disk.
# NOTE(review): this targets the same file as the save at the end of the
# works-count cell.
df_final.to_csv(f"{DATA_PATH}/df_indi_works.csv")

In [7]:
# Occupation names grouped by the broad category they are assigned to.
# Occupation names that do not appear here have no category.
_OCCUPATIONS_BY_CATEGORY = {
    'Painter': (
        'painter', 'drawer', 'printmaker', 'illustrator',
        'scientific illustrator', 'art historian',
    ),
    'Writer': (
        'poet', 'historian', 'theologian', 'translator', 'botanist',
        'philosopher', 'mathematician', 'chemist', 'linguist', 'astronomer',
        'entomologist', 'editor', 'philologist', 'archaeologist',
        'naturalist', 'physicist', 'novelist', 'classical philologist',
        'cartographer', 'economist', 'geologist', 'zoologist', 'scientist',
        'geographer',
    ),
    'Musician': (
        'composer', 'singer', 'musician', 'organist', 'opera singer',
        'conductor', 'pianist', 'violinist',
    ),
    'Architect': ('architect',),
    'Sculptor': ('sculptor', 'engraver', 'copperplate engraver'),
    'Performing Artist': ('playwright', 'stage actor'),
}

# Flat lookup: occupation name -> category
occupation_categories = {
    name: category
    for category, names in _OCCUPATIONS_BY_CATEGORY.items()
    for name in names
}

# Translate each raw occupation name into its category; names that are not
# keys of occupation_categories become NaN.
df_occupations['occupation'] = df_occupations['occupations_name'].map(occupation_categories)

# Discard only the rows that received no category. A bare dropna() would also
# discard rows that merely have a null in some unrelated column.
df_occupations = df_occupations.dropna(subset=['occupation'])

# Keep a single category per individual: shuffle the distinct
# (individual, category) pairs with a fixed seed, then keep the first pair
# seen for each individual.
df_unique_occupations = (
    df_occupations[['individual_wikidata_id', 'occupation']]
    .drop_duplicates()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .drop_duplicates('individual_wikidata_id', keep='first')
)

# Inner join: individuals without a category are left out of the result
df_final_occupation = pd.merge(df_final, df_unique_occupations, on='individual_wikidata_id')
df_final_occupation = df_final_occupation[
    ['individual_wikidata_id', 'decade', 'region_name', 'count_works', 'occupation']
].drop_duplicates()
df_final_occupation.to_csv(DATA_PATH + "/df_indi_works_occupations.csv")

In [8]:
# Show the occupation-annotated frame as the cell's output
df_final_occupation

Unnamed: 0,individual_wikidata_id,decade,region_name,count_works,occupation
0,Q1000034,1850,German world,0.0,Writer
1,Q1000034,1850,Germany,0.0,Writer
2,Q1000034,1850,Northwestern Europe,0.0,Writer
3,Q1000034,1850,Western Europe,0.0,Writer
4,Q100022441,1770,Northern France,0.0,Performing Artist
...,...,...,...,...,...
474182,Q999920,1790,Northwestern Europe,2.0,Painter
474183,Q999920,1790,Western Europe,2.0,Painter
474184,Q999983,1340,France,0.0,Writer
474185,Q999983,1340,Northwestern Europe,0.0,Writer


In [9]:
# How many distinct individuals ended up with an occupation category
individual_ids = df_final_occupation['individual_wikidata_id'].dropna()
unique_individuals = len(individual_ids.unique())
unique_individuals

139206

In [10]:
# Rows per occupation category, most frequent first
df_occupations["occupation"].value_counts()


occupation
Writer               103061
Painter               53245
Musician              29267
Sculptor              13925
Architect             11599
Performing Artist      5736
Name: count, dtype: int64