In [6]:
# 06_enrichment_artworks.ipynb

import pandas as pd
import numpy as np
import os
import re

# Load cleaned dataset
df = pd.read_csv("../outputs/artworks_cleaned.csv")

# ----------------------------
# 1. Create 'century' column
# ----------------------------
def get_century(year):
    if np.isnan(year):
        return 'Unknown'
    return f"{int((year - 1) // 100 + 1)}th Century"

df['century'] = df['year'].apply(get_century)

# ----------------------------
# 2. Gender flags
# ----------------------------
df['is_male'] = df['gender'] == 'Male'
df['is_female'] = df['gender'] == 'Female'
df['is_unknown_gender'] = df['gender'] == 'Unknown'

# ----------------------------
# 3. Artist known flag
# ----------------------------
df['is_known_artist'] = df['artist'] != 'Unknown'

# ----------------------------
# 4. Extract material from 'medium'
# ----------------------------
def extract_material(medium):
    if pd.isna(medium) or medium == 'Unknown':
        return 'Unknown'
    medium = medium.lower()
    materials = [
        'oil', 'acrylic', 'graphite', 'ink', 'bronze', 'wood',
        'paper', 'photograph', 'print', 'ceramic', 'video',
        'digital', 'glass', 'plastic', 'metal', 'charcoal', 'pastel'
    ]
    for material in materials:
        if material in medium:
            return material.capitalize()
    return 'Other'

df['material'] = df['medium'].apply(extract_material)

# ----------------------------
# 5. Define artistic era
# ----------------------------
def get_era(year):
    if pd.isna(year):
        return 'Unknown'
    if year >= 1945:
        return 'Contemporary'
    elif year >= 1850:
        return 'Modern'
    else:
        return 'Other'

df['era'] = df['year'].apply(get_era)

# ----------------------------
# 6. Extract country from nationality
# ----------------------------
def clean_country(nationality):
    if pd.isna(nationality) or nationality == 'Unknown':
        return 'Unknown'
    nat = nationality.lower().strip()
    replacements = {
        "american": "USA",
        "french": "France",
        "british": "UK",
        "english": "UK",
        "german": "Germany",
        "spanish": "Spain",
        "italian": "Italy",
        "japanese": "Japan",
        "chinese": "China",
        "canadian": "Canada",
        "mexican": "Mexico",
        "dutch": "Netherlands",
        "russian": "Russia",
        "swiss": "Switzerland",
        "austrian": "Austria"
    }
    for key, country in replacements.items():
        if key in nat:
            return country
    return 'Other'

df['country'] = df['nationality'].apply(clean_country)

# ----------------------------
# 7. Calculate artist age at acquisition
# ----------------------------
def calculate_age(row):
    try:
        if np.isnan(row['begindate']) or pd.isnull(row['dateacquired']):
            return np.nan
        return int(pd.to_datetime(row['dateacquired']).year - row['begindate'])
    except:
        return np.nan

df['artist_age_at_acquisition'] = df.apply(calculate_age, axis=1)


df['gender'] = df['gender'].astype(str).str.strip("() ").str.capitalize()
df['nationality'] = df['nationality'].astype(str).str.strip("() ").str.title()

# ----------------------------
# 8. Save enriched dataset
# ----------------------------
df.to_csv("../outputs/artworks_enriched.csv", index=False)

# ----------------------------
# 9. Summary
# ----------------------------
print("Enriched dataset shape:", df.shape)
print(df[['year', 'century', 'gender', 'is_known_artist', 'material', 'era', 'country', 'artist_age_at_acquisition']].head())


  exec(code_obj, self.user_global_ns, self.user_ns)


Enriched dataset shape: (263444, 39)
     year       century gender  is_known_artist    material           era  \
0  1896.0  19th Century   Male             True         Ink        Modern   
1  1987.0  20th Century   Male             True       Print  Contemporary   
2  1903.0  20th Century   Male             True    Graphite        Modern   
3  1980.0  20th Century   Male             True  Photograph  Contemporary   
4  1903.0  20th Century   Male             True    Graphite        Modern   

   country  artist_age_at_acquisition  
0  Austria                        NaN  
1   France                        NaN  
2  Austria                        NaN  
3    Other                        NaN  
4  Austria                        NaN  


In [5]:
df.head(10)  # Display first 10 rows for verification

Unnamed: 0,title,artist,constituentid,artistbio,nationality,begindate,enddate,gender,date,medium,...,year,century,is_male,is_female,is_unknown_gender,is_known_artist,material,era,country,artist_age_at_acquisition
0,"Ferdinandsbrücke Project, Vienna, Austria (Ele...",Otto Wagner,6210,"(Austrian, 1841–1918)",Austrian,,,Male,1896,Ink and cut-and-pasted painted pages on paper,...,1896.0,19th Century,False,False,False,True,Ink,Modern,Austria,
1,"City of Music, National Superior Conservatory ...",Christian de Portzamparc,7470,"(French, born 1944)",French,,,Male,1987,Paint and colored pencil on print,...,1987.0,20th Century,False,False,False,True,Print,Contemporary,France,
2,"Villa project, outside Vienna, Austria (Elevat...",Emil Hoppe,7605,"(Austrian, 1876–1957)",Austrian,,,Male,1903,"Graphite, pen, color pencil, ink, and gouache ...",...,1903.0,20th Century,False,False,False,True,Graphite,Modern,Austria,
3,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,"(French and Swiss, born Switzerland 1944)",,,,Male,1980,Photographic reproduction with colored synthet...,...,1980.0,20th Century,False,False,False,True,Photograph,Contemporary,Other,
4,"Villa project, outside Vienna, Austria (Exteri...",Emil Hoppe,7605,"(Austrian, 1876–1957)",Austrian,,,Male,1903,"Graphite, color pencil, ink, and gouache on tr...",...,1903.0,20th Century,False,False,False,True,Graphite,Modern,Austria,
5,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,"(French and Swiss, born Switzerland 1944)",,,,Male,1976-77,Gelatin silver photograph,...,1976.0,20th Century,False,False,False,True,Photograph,Contemporary,Other,
6,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,"(French and Swiss, born Switzerland 1944)",,,,Male,1976-77,Gelatin silver photographs,...,1976.0,20th Century,False,False,False,True,Photograph,Contemporary,Other,
7,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,"(French and Swiss, born Switzerland 1944)",,,,Male,1976-77,Gelatin silver photograph,...,1976.0,20th Century,False,False,False,True,Photograph,Contemporary,Other,
8,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,"(French and Swiss, born Switzerland 1944)",,,,Male,1976-77,Gelatin silver photograph,...,1976.0,20th Century,False,False,False,True,Photograph,Contemporary,Other,
9,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,"(French and Swiss, born Switzerland 1944)",,,,Male,1976-77,Gelatin silver photograph,...,1976.0,20th Century,False,False,False,True,Photograph,Contemporary,Other,
