In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [2]:
# load all csv files
data_1 = "princeton_20240101_20251015.csv" # 1/1/2024 to 10/15/2025
data_2 = "princeton_20230601_20231231.csv" # 6/1/2023 to 12/31/2023
data_3 = "princeton_lots_20220501_20230531_FINAL.csv" # 5/1/2022 to 5/31/2023
data_4 = "princeton.csv" # everything pre 5/1/2022

# read all csv files
df1 = pd.read_csv(data_1)
df2 = pd.read_csv(data_2)
df3 = pd.read_csv(data_3)
df4 = pd.read_csv(data_4)

df = pd.concat([df1, df2, df3, df4], ignore_index=True)

  df3 = pd.read_csv(data_3)
  df4 = pd.read_csv(data_4)


In [3]:
pd.set_option("display.max_columns", None)

# Cleaning datatypes

In [4]:
df_cleaned = df.copy()

In [5]:
# drop unused columns
df_cleaned = df_cleaned.drop(
    columns=[
        "raw_lot_id",
        "raw_auction_id",
        "lot_image_url",
        "artist_id",
    ]
)

In [6]:
# clean auction date
df_cleaned['auction_start_date'] = pd.to_datetime(
    df_cleaned['auction_start_date'],
    format='%Y-%m-%d',    
    errors='coerce'        
)

In [7]:
# clean prices
price_cols = [
    'price_usd_zeroied',
    'price_estimate_min_usd_zeroied',
    'price_estimate_max_usd_zeroied',
    'price_estimate_min',
    'price_estimate_max',
    'price_sold',
]

for c in price_cols:
    s = pd.to_numeric(df_cleaned[c], errors='coerce')
    df_cleaned[c] = s.round().astype('Int64') 

In [8]:
# clean bought_in as boolean
df_cleaned['bought_in'] = df_cleaned['price_sold'].isna().astype('boolean')

In [9]:
# change datatypes to integers
year_cols = [
    'artwork_creation_year',
    'artist_birth',
    'lot_num'
]

for c in year_cols:
    s = pd.to_numeric(df_cleaned[c], errors='coerce')
    df_cleaned[c] = s.round().astype('Int64')

In [10]:
# change remaining columns to strings
obj_cols = df_cleaned.select_dtypes(include=['object']).columns

df_cleaned[obj_cols] = df_cleaned[obj_cols].astype('string')

# Cleaning data values

In [11]:
# cleaning medium
df_cleaned["medium_final"] = (
    df_cleaned["medium_final"]
    .str.strip()   
    .str.lower()  
)

df_cleaned["medium_final"] = df_cleaned["medium_final"].replace({
    "painting": "paintings",
    "others": "other",
})

In [12]:
# cleaning auction house
mask = df_cleaned["auction_house_name"] == "Phillips de Pury & Company"
df_cleaned.loc[mask, "auction_house_name"] = "Phillips"

mask = df_cleaned["auction_house_name"] == "Bonhams & Butterfields"
df_cleaned.loc[mask, "auction_house_name"] = "Bonhams"

mask = df_cleaned["auction_house_name"] == "Poly Auction"
df_cleaned.loc[mask, "auction_house_name"] = "Poly International Auction"

mask = df_cleaned["auction_house_name"] == "Bonhams & Brooks"
df_cleaned.loc[mask, "auction_house_name"] = "Bonhams"

In [13]:
# cleaning auction locations
mask = df_cleaned["auction_location"] == "Baron Ribeyre & Associes, E. Farrando SVV, Paris"
df_cleaned.loc[mask, "auction_location"] = "Paris"

mask = df_cleaned["auction_location"] == "Via Pitteri, Milan"
df_cleaned.loc[mask, "auction_location"] = "Milan"

mask = df_cleaned["auction_location"] == "Poulain, F. Tajan, Paris"
df_cleaned.loc[mask, "auction_location"] = "Paris"

mask = df_cleaned["auction_location"] == "Paris - Drouot"
df_cleaned.loc[mask, "auction_location"] = "Paris"

mask = df_cleaned["auction_location"] == "Berlin-Grunewald"
df_cleaned.loc[mask, "auction_location"] = "Berlin"

mask = df_cleaned["auction_location"] == "Inc, Altadena"
df_cleaned.loc[mask, "auction_location"] = "Altadena"

mask = df_cleaned["auction_location"] == "Royere et Lajeunesse, Versailles"
df_cleaned.loc[mask, "auction_location"] = "Versailles"

mask = df_cleaned["auction_location"] == "Palais Dorotheum Wien"
df_cleaned.loc[mask, "auction_location"] = "Vienna"

mask = df_cleaned["auction_location"] == "Berlin-Charlottenburg"
df_cleaned.loc[mask, "auction_location"] = "Berlin"

mask = df_cleaned["auction_location"] == "Antwerp-Berchem"
df_cleaned.loc[mask, "auction_location"] = "Antwerp"

mask = df_cleaned["auction_location"] == "London, New Bond Street"
df_cleaned.loc[mask, "auction_location"] = "London"

mask = df_cleaned["auction_location"] == "London, Knightsbridge"
df_cleaned.loc[mask, "auction_location"] = "London"

mask = df_cleaned["auction_location"] == "Inc., New Orleans"
df_cleaned.loc[mask, "auction_location"] = "New Orleans"

mask = df_cleaned["auction_location"] == "Via Pontaccio, Milan"
df_cleaned.loc[mask, "auction_location"] = "Milan"

mask = df_cleaned["auction_location"] == "Inc., Dowington"
df_cleaned.loc[mask, "auction_location"] = "Dowington"

mask = df_cleaned["auction_location"] == "Inc., New York"
df_cleaned.loc[mask, "auction_location"] = "New York"

mask = df_cleaned["auction_location"] == "Ltd., Mystic"
df_cleaned.loc[mask, "auction_location"] = "Mystic"

mask = df_cleaned["auction_location"] == "Online, New York"
df_cleaned.loc[mask, "auction_location"] = "New York"

mask = df_cleaned["auction_location"] == "Lyon Brotteaux"
df_cleaned.loc[mask, "auction_location"] = "Lyon"

mask = df_cleaned["auction_location"] == "Inc., Columbia"
df_cleaned.loc[mask, "auction_location"] = "Columbia"

In [14]:
# standardize measurement units to centimeters
INCH_TO_CM = 2.54

mask = df_cleaned["artwork_measurements_unit"] == "inches"

df_cleaned.loc[mask, [
    "artwork_measurements_width",
    "artwork_measurements_height",
    "artwork_measurements_depth",
]] *= INCH_TO_CM

df_cleaned.loc[mask, "artwork_measurements_unit"] = "centimeters"

# New variables

In [15]:
# mid estimate
df_cleaned['price_estimate_mid_usd'] = (df_cleaned['price_estimate_min_usd_zeroied'] + df_cleaned['price_estimate_max_usd_zeroied']) / 2

In [16]:
# auction month and year
df_cleaned['auction_month'] = df_cleaned['auction_start_date'].dt.month
df_cleaned['auction_year'] = df_cleaned['auction_start_date'].dt.year

In [17]:
# hammer price
df_cleaned['price_kind'] = (
    df_cleaned['price_kind']
    .astype(str)
    .str.strip()
    .str.lower()
)

df_cleaned["hammer_price_usd_zeroied"] = np.where(
    df_cleaned["price_kind"].isin(["premium", "purchase"]),
    df_cleaned["price_usd_zeroied"] / 1.1375,
    df_cleaned["price_usd_zeroied"]
)

In [18]:
# signed
df_cleaned["signed"] = df_cleaned["artwork_markings"].str.contains(
    r"\bsigned\b",
    case=False,
    na=False
)

In [19]:
# has provenance
df_cleaned["has_provenance"] = (
    df_cleaned["artwork_provenance"].notna() &
    (df_cleaned["artwork_provenance"].str.strip() != "")
)

In [20]:
# exhibited
df_cleaned["exhibited"] = (
    df_cleaned["artwork_exhibited"].notna() &
    (df_cleaned["artwork_exhibited"].str.strip() != "")
)

In [21]:
# has literature
df_cleaned["has_literature"] = (
    df_cleaned["artwork_literature"].notna() &
    (df_cleaned["artwork_literature"].str.strip() != "")
)

In [22]:
# drop unused columns
df_cleaned = df_cleaned.drop(
    columns=[
        "price_estimate_min",
        "price_estimate_max",
        "price_sold",
        "price_usd_zeroied",
        "price_kind",
    ]
)

# Adjust prices for inflation

In [23]:
# load and read cpi data
cpi_data = "cpi.csv"
df_cpi = pd.read_csv(cpi_data)

df_cpi.loc[489, "Value"] = 324.8 # missing cpi for October 2025, so fill in with previous month

In [24]:
# clean cpi data
df_cpi["auction_month"] = df_cpi["Period"].str.replace("M", "", regex=False).astype(int)

df_cpi = df_cpi.rename(columns={
    "Year": "auction_year",
    "Value": "cpi"
})[["auction_year", "auction_month", "cpi"]]

df_cpi["cpi"] = pd.to_numeric(df_cpi["cpi"], errors="coerce")

In [25]:
# merge cpi data into df_cleaned
df_cleaned = df_cleaned.merge(
    df_cpi,
    on=["auction_year", "auction_month"],
    how="left"
)

In [26]:
# adjust prices for inflation
base_year = df_cleaned["auction_year"].max()
cpi_base = df_cleaned.loc[
    df_cleaned["auction_year"] == base_year, "cpi"
].mean()

price_cols = [
    "price_estimate_min_usd_zeroied",
    "price_estimate_max_usd_zeroied",
    "price_estimate_mid_usd",
    "hammer_price_usd_zeroied",
]

for col in price_cols:
    df_cleaned[f"{col}_real"] = (
        df_cleaned[col] * (cpi_base / df_cleaned["cpi"])
    )

# Artist details (continent, genre, gender, date of death)

## Continent

In [27]:
# clean artist nationalities
df_cleaned["artist_nationality"] = (
    df_cleaned["artist_nationality"]
    .astype(str)                 
    .str.strip()                 
    .replace("`", "", regex=False)  
    .str.lower()               
)

df_cleaned.loc[
    df_cleaned["artist_nationality"] == "",
    "artist_nationality"
] = pd.NA

In [28]:
# fix artist nationalities
df_cleaned.loc[df_cleaned["artist_name"].str.strip() == "Karina Ami","artist_nationality"] = "danish"

df_cleaned.loc[df_cleaned["artist_name"].str.strip() == "Jürgen Flohr","artist_nationality"] = "german"

df_cleaned.loc[df_cleaned["artist_name"].str.strip() == "Nancy Switzer","artist_nationality"] = "american"

df_cleaned.loc[df_cleaned["artist_name"].str.strip() == "Philippe Petit","artist_nationality"] = "french"

df_cleaned.loc[df_cleaned["artist_name"].str.strip() == "Ulala Imai","artist_nationality"] = "japanese"

df_cleaned.loc[df_cleaned["artist_nationality"] == "uzbek","artist_nationality"] = "uzbekistani"

df_cleaned.loc[df_cleaned["artist_nationality"] == "venezualian","artist_nationality"] = "venezuelan"
df_cleaned.loc[df_cleaned["artist_nationality"] == "venezualan","artist_nationality"] = "venezuelan"

df_cleaned.loc[df_cleaned["artist_nationality"] == "yugoslavian","artist_nationality"] = "yugoslav"

df_cleaned.loc[df_cleaned["artist_nationality"] == "tawainese","artist_nationality"] = "taiwanese"

df_cleaned.loc[df_cleaned["artist_nationality"] == "south koreanese","artist_nationality"] = "korean"
df_cleaned.loc[df_cleaned["artist_nationality"] == "south korean","artist_nationality"] = "korean"

df_cleaned.loc[df_cleaned["artist_nationality"] == "slovene","artist_nationality"] = "slovenian"

df_cleaned.loc[df_cleaned["artist_nationality"] == "america","artist_nationality"] = "american"

df_cleaned.loc[df_cleaned["artist_nationality"] == "argentinian","artist_nationality"] = "argentine"

df_cleaned.loc[df_cleaned["artist_nationality"] == "austria","artist_nationality"] = "austrian"

df_cleaned.loc[df_cleaned["artist_nationality"] == "bosniak","artist_nationality"] = "bosnian"

df_cleaned.loc[df_cleaned["artist_nationality"] == "bristish","artist_nationality"] = "british"
df_cleaned.loc[df_cleaned["artist_nationality"] == "britsh","artist_nationality"] = "british"

df_cleaned.loc[df_cleaned["artist_nationality"] == "californian","artist_nationality"] = "american"

df_cleaned.loc[df_cleaned["artist_nationality"] == "cameroon","artist_nationality"] = "cameroonian"

df_cleaned.loc[df_cleaned["artist_nationality"] == "croation","artist_nationality"] = "croatian"

df_cleaned.loc[df_cleaned["artist_nationality"] == "denmark","artist_nationality"] = "danish"

df_cleaned.loc[df_cleaned["artist_nationality"] == "english","artist_nationality"] = "british"

df_cleaned.loc[df_cleaned["artist_nationality"] == "ethopian","artist_nationality"] = "ethiopian"

df_cleaned.loc[df_cleaned["artist_nationality"] == "european/belgian","artist_nationality"] = "belgian"

df_cleaned.loc[df_cleaned["artist_nationality"] == "federico) corchon y diaque (spanish","artist_nationality"] = "spanish"

df_cleaned.loc[df_cleaned["artist_nationality"] == "france","artist_nationality"] = "french"

df_cleaned.loc[df_cleaned["artist_nationality"] == "ghanian","artist_nationality"] = "ghanaian"

df_cleaned.loc[df_cleaned["artist_nationality"] == "guyanan","artist_nationality"] = "guyanese"

df_cleaned.loc[df_cleaned["artist_nationality"] == "india","artist_nationality"] = "indian"

df_cleaned.loc[df_cleaned["artist_nationality"] == "irelandish","artist_nationality"] = "irish"

df_cleaned.loc[df_cleaned["artist_nationality"] == "japan","artist_nationality"] = "japanese"
df_cleaned.loc[df_cleaned["artist_nationality"] == "janapese","artist_nationality"] = "japanese"

df_cleaned.loc[df_cleaned["artist_nationality"] == "kazak","artist_nationality"] = "kazakhstani"
df_cleaned.loc[df_cleaned["artist_nationality"] == "kazakh","artist_nationality"] = "kazakhstani"

df_cleaned.loc[df_cleaned["artist_nationality"] == "lebanon","artist_nationality"] = "lebanese"

df_cleaned.loc[df_cleaned["artist_nationality"] == "marrocan","artist_nationality"] = "moroccan"
df_cleaned.loc[df_cleaned["artist_nationality"] == "morrocan","artist_nationality"] = "moroccan"

df_cleaned.loc[df_cleaned["artist_nationality"] == "new zealander","artist_nationality"] = "new zealand"

df_cleaned.loc[df_cleaned["artist_nationality"] == "north american","artist_nationality"] = "native american"

df_cleaned.loc[df_cleaned["artist_nationality"] == "pineu) (french","artist_nationality"] = "french"

df_cleaned.loc[df_cleaned["artist_nationality"] == "saudi","artist_nationality"] = "saudi arabian"

df_cleaned.loc[df_cleaned["artist_nationality"] == "serban","artist_nationality"] = "serbian"

df_cleaned.loc[df_cleaned["artist_nationality"] == "singapore","artist_nationality"] = "singaporean"

df_cleaned.loc[df_cleaned["artist_nationality"] == "utrecht","artist_nationality"] = "dutch"

df_cleaned.loc[df_cleaned["artist_nationality"] == "spanish american","artist_nationality"] = "spanish"

In [29]:
# continent sets 
AFRICA = {
    "african","algerian","angolan","beninese","botswanian","burkinabe","burundian",
    "cameroonian","congolese","egyptian","eritrean","ethiopian","ghanaian","guinean",
    "ivorian","kenyan","libyan","malagasy","malawian","malian","mauritian",
    "moroccan","mozambican","namibian","nigerian","rwandan","sao tomean",
    "senegalese","sierra leonean","south african","sudanese","swazi",
    "tanzanian","togolese","tunisian","ugandan","zairean","zambian","zimbabwean"
}

ASIA = {
    "afghan","armenian","azerbaijani","bahraini","balinese","bangladeshi","bengali",
    "bhutanese","cambodian","chinese","filipino","georgian","hong kongese","indian",
    "indonesian","iranian","iraqi","israeli","israeli-arab","japanese","javanese",
    "jordanian","kazakhstani","korean","kuwaiti","kyrgyzstani","laosean","lebanese",
    "malaysian","mongolian","nepalese","omani","pakistani","palestinian",
    "palestinian-saudi","persian","saudi arabian","singaporean","sri lankan",
    "syrian","taiwanese","thai","tibetan","turkish","turkmen","uzbekistani",
    "vietnamese","yemeni","mesopotamian","emirati","burmese","qatari","asian","israeli-arab"
}

EUROPE = {
    "albanian","austrian","belarusian","belgian","bohemian","bosnian","british",
    "bulgarian","croatian","cypriot","czech","danish","dutch","english","estonian",
    "faroese","finnish","flemish","french","german","greek","hungarian",
    "icelandic","irish","italian","latvian","liechtensteiner","lithuanian",
    "luxembourger","macedonian","maltese","moldovan","monacan","montenegrin",
    "neapolitan","norwegian","polish","portuguese","prussian","roman","romanian",
    "russian","scandinavian","scottish","serbian","slovak","slovenian","spanish",
    "swedish","swiss","ukrainian","welsh","yugoslav","european","j gilliard","j van haecken",
    "anglo-hungarian","anglo-scottish","british/irish","czech/german","french / italian",
    "hungarian-french","french/swiss","swiss-french","russian/french","russian/ukrainian"
}

NORTH_AMERICA = {
    "american","bahamian","barbadian","belizean","bermudian","canadian","costa rican",
    "cuban","dominican","grenadian","guatemalan","haitian","honduran","jamaican",
    "mexican","nicaraguan","panamanian","puerto rican","salvadoran","trinidadian",
    "antiguan","native american","caribbean"
}

SOUTH_AMERICA = {
    "argentine","bolivian","brazilian","chilean","colombian","ecuadorian",
    "paraguayan","peruvian","uruguayan","venezuelan","surinamese","guyanese","south american",
    "latin american"
}

OCEANIA = {
    "aboriginal australian","australian","new caledonian","new zealand",
    "papua new guinean","samoan","tahitian","cook islander","oceanic"
}

In [30]:
def nationality_to_continent(nat):
    if nat is None or pd.isna(nat):
        return pd.NA

    nat = str(nat).strip().lower()

    if nat == "" or nat == "<na>":
        return pd.NA

    if nat in AFRICA:
        return "africa"
    if nat in ASIA:
        return "asia"
    if nat in EUROPE:
        return "europe"
    if nat in NORTH_AMERICA:
        return "north america"
    if nat in SOUTH_AMERICA:
        return "south america"
    if nat in OCEANIA:
        return "oceania"

    # anything not explicitly assigned above
    return "intercontinental"

In [31]:
df_cleaned["artist_continent"] = df_cleaned["artist_nationality"].apply(
    nationality_to_continent
)

## Genre

In [33]:
df_cleaned["artist_genre"] = pd.NA

df_cleaned.loc[
    (df_cleaned["artist_birth"] >= 1250) & (df_cleaned["artist_birth"] <= 1820),
    "artist_genre"
] = "Old Masters"

df_cleaned.loc[
    (df_cleaned["artist_birth"] >= 1821) & (df_cleaned["artist_birth"] <= 1910),
    "artist_genre"
] = "Impressionist and Modern"

df_cleaned.loc[
    (df_cleaned["artist_birth"] >= 1911) & (df_cleaned["artist_birth"] <= 1974),
    "artist_genre"
] = "Postwar and Contemporary"

df_cleaned.loc[
    df_cleaned["artist_birth"] >= 1975,
    "artist_genre"
] = "Ultra-Contemporary"

## Gender

In [34]:
# unique artists
artists_raw = (
    df_cleaned[["artist_name"]]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

# stable copy for auditing
audit = artists_raw.copy()
audit["artist_name_original"] = audit["artist_name"]
audit["drop_reason"] = ""  # accumulate reasons

# work on a cleaned name column 
audit["artist_name_clean"] = audit["artist_name_original"]

# cleaning (does not drop)
audit["artist_name_clean"] = audit["artist_name_clean"].str.replace(
    r"^Attributed to\s+", "", regex=True
)
audit["artist_name_clean"] = audit["artist_name_clean"].str.replace(
    r"\s+and Workshop$", "", regex=True
)

# drop rules
def add_reason(mask, reason):
    audit.loc[mask, "drop_reason"] = (
        audit.loc[mask, "drop_reason"]
        .where(audit.loc[mask, "drop_reason"] == "", audit.loc[mask, "drop_reason"] + " | ")
        + reason
    )

# rule 1: starts with After/School of/Circle of OR contains digits/#/&
mask1 = audit["artist_name_clean"].str.contains(
    r"^(After|School of|Circle of)\s+|[0-9#&]",
    regex=True,
    na=False
)
add_reason(mask1, "after_school_or_digit_hash_amp")

# rule 2: title-like / subject-like starts
mask2 = audit["artist_name_clean"].str.contains(
    r"^(A|An|The|Saint|St\.|Holy|Our Lady|Madonna|Baroque)\b",
    regex=True,
    na=False
)
add_reason(mask2, "starts_with_article_or_religious_or_baroque")

# rule 3: contains object words
OBJECT_WORDS = [
    "brooch", "bracelet", "necklace", "ring", "jug", "can", "vase", "bowl", "ink",
    "painting", "panel", "plaque", "albarello", "reliquary", "sculpture",
    "furniture", "cabinet", "table", "chair", "box", "casket", "triptych", "diptych",
    "watch", "clock", "frame", "icon", "drawers", "tables", "mirror", "tapestry",
    "desk", "chairs", "sofa", "chandelier", "candelabras", "with"
]
pattern = r"\b(" + "|".join(OBJECT_WORDS) + r")\b"

mask3 = audit["artist_name_clean"].str.contains(
    pattern,
    case=False,
    regex=True,
    na=False
)
add_reason(mask3, "contains_object_word")

# kept vs dropped
dropped = audit[audit["drop_reason"] != ""].copy()
kept = audit[audit["drop_reason"] == ""].copy()

artists = kept[["artist_name_clean"]].rename(columns={"artist_name_clean": "artist_name"}).reset_index(drop=True)

# write outputs
artists.to_csv("artists_unique.csv", index=False)

# save dropped with both original + cleaned + reasons
dropped_out = dropped[["artist_name_original", "artist_name_clean", "drop_reason"]].reset_index(drop=True)
dropped_out.to_csv("artists_dropped.csv", index=False)

print(f"Starting unique artists: {len(artists_raw):,}")
print(f"Kept: {len(artists):,}")
print(f"Dropped: {len(dropped_out):,}")

  mask1 = audit["artist_name_clean"].str.contains(
  mask2 = audit["artist_name_clean"].str.contains(
  mask3 = audit["artist_name_clean"].str.contains(


Starting unique artists: 331,543
Kept: 312,977
Dropped: 18,566


In [35]:
# scraped Wikidata on local computer

In [36]:
# load and read gender data
gender_data = "artist_gender_lookup.csv"
df_gender = pd.read_csv(gender_data)

In [37]:
# merge into main df
df_cleaned = df_cleaned.merge(df_gender, on="artist_name",how="left")

## Death

In [38]:
# load and read death data
death_data = "artist_vitals_lookup.csv"
df_death = pd.read_csv(death_data)

In [39]:
# clean death data
df_death = df_death.rename(columns={"vital_status":"artist_dead", "death_date":"artist_death_date"})

df_death['artist_death_date'] = pd.to_datetime(
    df_death['artist_death_date'],
    format='%Y-%m-%d',    
    errors='coerce'        
)

df_death["artist_dead"] = df_death["artist_dead"].eq("Dead").astype("boolean")

In [40]:
# merge into main df
df_cleaned = df_cleaned.merge(
    df_death[["artist_name", "artist_dead", "artist_death_date"]],
    on="artist_name",
    how="left"
)

In [41]:
# add year of death
df_cleaned["artist_death_year"] = df_cleaned["artist_death_date"].dt.year

# Major events

### Exhibitions and retrospectives
* Magiciens de la terre (1989): counteracted ethoncentric practices within the contemporary art world
* The Other Story (1989-1990): Asian, African and Caribbean artists in post war Britain
* documenta X (1997): experimental performance space and digital platform
* Sensation (1997): Young British Artists, market-museum nexus
* Cities on the Move (1997-1999): cultural impact of East Asia's rapid urban development
* WACK! Art and the Feminist Revolution (2007): international women's art
* Marina Abramović: The Artist is Present (2010)
* Ai Weiwei: Sunflower Seeds (2010)
* Niki de Saint Phalle: Grand Palais (2014-2015)
* Anish Kapoor: Versailles (2015)
* Soul of a Nation: Art in the Age of Black Power (2017)
* Artistes & Robots (2018): art + AI/robotics
* Hilma af Klint: Paintings for the Future (2018-2019)
Themes: Globalization/postcolonial/decolonial turning points, Museum-as-spectacle (Abramovic & Ai Weiwei), Feminist (WACK, Niki), Tech-art

### Other events
* Sotheby's/Christie's price-fixing scandal (2000-2001)
* Buyer's premium schedule jumps - transaction cost shock (2013)
* Sotheby's goes private (2019)
* GFC (2008-2009)
* U.S. TCJA eliminates 1031 like-kind exchanges for personal property (art) (2018)
* U.S. AML Act (art-market scrutiny accelerates) (2021)
* Salvator Mundi sale (2017)
* First mainstream "online-only" sales initiatives (2013)

# Prepare for Stata

In [42]:
# shorten column names
df_cleaned = df_cleaned.rename(columns={"price_estimate_min_usd_zeroied":"price_estimate_min_usd",
                                        "price_estimate_max_usd_zeroied":"price_estimate_max_usd",
                                        "hammer_price_usd_zeroied":"hammer_price_usd",
                                        "price_estimate_min_usd_zeroied_real":"price_estimate_min_usd_real",
                                        "price_estimate_max_usd_zeroied_real":"price_estimate_max_usd_real",
                                        "hammer_price_usd_zeroied_real":"hammer_price_usd_real"})

In [43]:
# convert booleans to integers
bool_cols = df_cleaned.select_dtypes(include="boolean").columns
df_cleaned[bool_cols] = df_cleaned[bool_cols].fillna(False).astype("int8")

In [44]:
# convert strings to objects
str_cols = df_cleaned.select_dtypes(include="string").columns
df_cleaned[str_cols] = df_cleaned[str_cols].astype("object")

In [45]:
# fix year variables
year_cols = ["auction_year", "auction_month", "artist_death_year"]

for c in year_cols:
    if c in df_cleaned.columns:
        df_cleaned[c] = df_cleaned[c].astype("Int64")

In [46]:
# convert datetimes to dates
df_cleaned["auction_start_date"] = pd.to_datetime(df_cleaned["auction_start_date"], errors="coerce")
df_cleaned["artist_death_date"]  = pd.to_datetime(df_cleaned["artist_death_date"],  errors="coerce")

In [47]:
# drop unnecessary columns
drop_cols = [
    "artwork_condition_in", "artwork_description", "artwork_exhibited", "artwork_literature", "artwork_markings", "artwork_materials",
    "artwork_provenance", "artwork_size_notes", "catalog_notes", "auction_num", "artwork_edition_current", "artwork_edition_size", "cpi" 
]
df_export = df_cleaned.drop(columns=[c for c in drop_cols if c in df_cleaned.columns])

In [48]:
# export to stata
df_export.to_parquet("df_cleaned_reg.parquet", index=False)

In [49]:
df_export

Unnamed: 0,auction_house_name,auction_location,auction_name,auction_start_date,lot_num,bought_in,currency,artwork_creation_year,artwork_measurements_width,artwork_measurements_height,artwork_measurements_depth,artwork_measurements_unit,artwork_name,medium_final,price_estimate_min_usd,price_estimate_max_usd,artist_name,artist_nationality,artist_birth,price_estimate_mid_usd,auction_month,auction_year,hammer_price_usd,signed,has_provenance,exhibited,has_literature,price_estimate_min_usd_real,price_estimate_max_usd_real,price_estimate_mid_usd_real,hammer_price_usd_real,artist_continent,artist_genre,artist_gender,artist_dead,artist_death_date,artist_death_year
0,Sotheby's,New York,A Scholar Collects,2024-01-31,10,0,USD,1826,17.7,14.200,,centimeters,Landscape of the Ardennes with the Church of M...,works on paper,12000,16000,Elisabeth-Louise Vigée Le Brun,french,1755,14000.0,1,2024,2.232967e+04,0,1,1,1,12514.00939,16685.345854,14599.677622,2.328614e+04,europe,Old Masters,Female,1,1842-03-30,1842
1,Sotheby's,New York,A Scholar Collects,2024-01-31,9,0,USD,1821,20.9,15.000,,centimeters,Sky study,works on paper,12000,16000,Elisabeth-Louise Vigée Le Brun,french,1755,14000.0,1,2024,3.907692e+04,0,1,1,1,12514.00939,16685.345854,14599.677622,4.075075e+04,europe,Old Masters,Female,1,1842-03-30,1842
2,Sotheby's,New York,A Scholar Collects,2024-01-31,20,0,USD,,27.4,32.000,,centimeters,"A young woman wearing pearl earrings, with flo...",works on paper,60000,80000,Rosalba Carriera,italian,1675,70000.0,1,2024,1.674725e+05,0,1,1,1,62570.046952,83426.729269,72998.38811,1.746461e+05,europe,Old Masters,Female,1,1757-04-15,1757
3,Sotheby's,New York,A Scholar Collects,2024-01-31,11,0,USD,,38.1,50.165,,centimeters,Self-Portrait In Traveling Costume,works on paper,700000,1000000,Elisabeth-Louise Vigée Le Brun,french,1755,850000.0,1,2024,2.712088e+06,0,1,1,1,729983.881103,1042834.115862,886408.998483,2.828258e+06,europe,Old Masters,Female,1,1842-03-30,1842
4,Sotheby's,New York,A Scholar Collects,2024-01-31,8,0,USD,,28.2,43.200,,centimeters,Profile Portrait of the Duchesse de Polignac,works on paper,120000,180000,Elisabeth-Louise Vigée Le Brun,french,1755,150000.0,1,2024,2.679560e+05,0,1,1,1,125140.093903,187710.140855,156425.117379,2.794337e+05,europe,Old Masters,Female,1,1842-03-30,1842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6283071,Sotheby's,New York,American Art,2020-06-26,45,1,USD,2020,49.5,30.500,,centimeters,SECONNET POINT FROM THE EAST END,paintings,60000,80000,Worthington Whittredge,american,1820,70000.0,6,2020,0.000000e+00,1,1,0,0,74856.054069,99808.072091,87332.06308,0.000000e+00,north america,Old Masters,Male,1,1910-02-25,1910
6283072,Sotheby's,New York,American Art,2020-06-26,48,0,USD,1850,76.8,51.400,,centimeters,"SCHOONER ""LOO CHOO"" IN A STORMY SEA",paintings,80000,120000,Fitz Henry Lane,american,1804,100000.0,6,2020,8.241758e+04,1,1,0,0,99808.072091,149712.108137,124760.090114,1.028243e+05,north america,Old Masters,Male,1,1865-08-13,1865
6283073,Sotheby's,New York,American Art,2020-06-26,49,0,USD,1850,76.8,51.400,,centimeters,"SCHOONER ""LOO CHOO"" IN A CALM SEA",paintings,60000,80000,Fitz Henry Lane,american,1804,70000.0,6,2020,2.747253e+05,1,1,0,0,74856.054069,99808.072091,87332.06308,3.427475e+05,north america,Old Masters,Male,1,1865-08-13,1865
6283074,Sotheby's,New York,American Art,2020-06-26,51,0,USD,1830,45.7,34.900,,centimeters,"SOUTHERN SIERRA, MOUNT WHITNEY REGION",paintings,20000,30000,Albert Bierstadt,german,1830,25000.0,6,2020,1.868132e+04,1,1,0,0,24952.018023,37428.027034,31190.022529,2.330683e+04,europe,Impressionist and Modern,Male,1,1902-02-18,1902
