In [1]:
import pandas as pd

from utilities import (
    DYESTUFF_COLS,
    EXOTIC_COLS,
    FAITH_ENCODINGS,
    FULL_OBJECT_COLS,
    GENDER_ENCODINGS,
    WEAPONS_AND_ARMOR_COLS,
)


In [2]:
# Columns removed from the raw data immediately after it is loaded.
# Apart from the first entry, each line holds one "all"/"points" pair.
TEMP_COLS = [
    'bwe_object_points',
    'bwe_all_linens', 'bwe_linen_points',
    'bwe_all_exotic', 'bwe_exotic_points',
    'bwe_all_dyestuffs', 'bwe_dyestuff_points',
    'bwe_all_arms_armor', 'bwe_arms_armor_points',
]

In [3]:
# Load the raw sample. The frame is rebuilt from the CSV every time this cell
# runs, so re-running the cell is safe.
df = pd.read_csv('data/Sample_4.csv')

# Keep the row index as an explicit unique id column.
df['id'] = df.index

# Drop the temporary columns; errors='ignore' skips any that are not present.
df = df.drop(columns=TEMP_COLS, errors='ignore')

# Strip ordinal suffixes (st, nd, rd, th) from wealth_quartile, e.g. '1st' -> '1'.
# The values are still strings here; they are cast to int at the end of the cell.
df['wealth_quartile'] = df['wealth_quartile'].str.replace(r'(\d+)(st|nd|rd|th)', r'\1', regex=True)

# Presence/absence flags (1/0) for tablecloths, bedsheets and tunics, derived
# from the totals. A missing total compares as False and therefore yields 0.
df['tablecloths'] = df['total_tablecloths'].gt(0).astype(int)
df['bedsheets'] = df['total_bedsheets'].gt(0).astype(int)
df['tunics'] = df['total_tunics'].gt(0).astype(int)

# Presence/absence flag for linens combined: 1 if any of the three flags is set.
df['linens'] = df[['tablecloths', 'bedsheets', 'tunics']].gt(0).any(axis=1).astype(int)

# total_linens is the row-wise sum of the three totals (NaN is skipped by sum).
df['total_linens'] = df[['total_tablecloths', 'total_bedsheets', 'total_tunics']].sum(axis=1)

# Presence/absence flag for weapons and armor: 1 if any of the
# WEAPONS_AND_ARMOR_COLS is greater than zero in that row.
df['weapons'] = df[WEAPONS_AND_ARMOR_COLS].gt(0).any(axis=1).astype(int)

# 'var_weapons' is the row-wise sum of all weapons and armor columns.
df['var_weapons'] = df[WEAPONS_AND_ARMOR_COLS].sum(axis=1)

# 'var_exotic' is the row-wise sum of all EXOTIC_COLS.
df['var_exotic'] = df[EXOTIC_COLS].sum(axis=1)

# 'var_dyestuff' is the row-wise sum of all DYESTUFF_COLS.
df['var_dyestuff'] = df[DYESTUFF_COLS].sum(axis=1)

# Encode faith and gender through the lookup tables from utilities.
# NOTE(review): assuming these are plain dict lookups, any value without a key
# comes out as NaN -- TODO confirm every category in the data is covered.
df['faith'] = df['faith'].map(FAITH_ENCODINGS)
df['gender_decedent'] = df['gender_decedent'].map(GENDER_ENCODINGS)

# Replace NaN with 0 in the count-like columns and convert them to integers.
target_cols = [
    'total_tablecloths',
    'total_bedsheets',
    'total_tunics',
    'object_phrases',
    'wealth_points',
    'wealth_quartile',
    *FULL_OBJECT_COLS,
]
df[target_cols] = df[target_cols].fillna(0).astype(int)


In [4]:
# Write the processed frame to disk. index=False leaves the row index out of
# the file.
df.to_csv(path_or_buf='data/dataset.csv', index=False)
