In [1]:
import pandas as pd

from utilities import (
    ARMOUR,
    CUSHIONS,
    DYESTUFFS,
    EXOTIC,
    FAITH_DECODING,
    FAITH_ENCODING,
    FOOD_PREPARATION,
    GENDER_DECODING,
    GENDER_ENCODING,
    HOUSEHOLD_ALL,
    LINENS,
    LUXURY,
    MELEE_WEAPONS,
    PILLOWS,
    POLEARMS,
    PROJECTILE_WEAPONS,
    SHIELDS,
    WEAPONS_AND_ARMOUR_ALL,
)


In [2]:
# Read the raw sample and the column key from the data directory
df = pd.read_csv('data/Sample_5.csv')
key = pd.read_csv('data/key.csv')

# Copy the row index into an 'id' column so each record carries an identifier
df = df.assign(id=df.index)

# Presence/absence (1/0) flags for tablecloths, bedsheets and tunics, derived
# from their totals. The comparison is vectorised instead of a per-element
# lambda; a missing total compares as False and therefore yields 0.
df['tablecloths'] = df['total_tablecloths'].gt(0).astype(int)
df['bedsheets'] = df['total_bedsheets'].gt(0).astype(int)
df['tunics'] = df['total_tunics'].gt(0).astype(int)
# Combined linens flag: 1 when any column of the LINENS group is positive in the row
df['linens'] = df[LINENS].gt(0).any(axis=1).astype(int)
# Row-wise sum of the three linen totals (sum skips missing values)
df['total_linens'] = df[['total_tablecloths', 'total_bedsheets', 'total_tunics']].sum(axis=1)

# Object groups that get both a presence/absence flag (1/0) and a 'var_<name>'
# column holding the row-wise sum of the group's columns. The column lists come
# from the utilities module; dict order fixes the order in which the new
# columns are appended to the frame.
# NOTE(review): 'var_' presumably stands for "variety"; the sum equals the
# number of distinct object types only if the group columns are 0/1
# indicators — confirm against the key.
summarised_groups = {
    'dyestuffs': DYESTUFFS,
    'exotic': EXOTIC,
    'food_prep': FOOD_PREPARATION,
    'household': HOUSEHOLD_ALL,
    'luxury': LUXURY,
    'weapons': WEAPONS_AND_ARMOUR_ALL,
}
for group_name, group_cols in summarised_groups.items():
    # Vectorised "any value > 0 in the row"; missing values compare as False
    df[group_name] = df[group_cols].gt(0).any(axis=1).astype(int)
    df[f'var_{group_name}'] = df[group_cols].sum(axis=1)

# Subgroups (weapons and armour, then cushions and pillows) that only get a
# presence/absence flag
flag_only_groups = {
    'armour': ARMOUR,
    'melee_weapons': MELEE_WEAPONS,
    'polearms': POLEARMS,
    'projectile_weapons': PROJECTILE_WEAPONS,
    'shields': SHIELDS,
    'cushions': CUSHIONS,
    'pillows': PILLOWS,
}
for group_name, group_cols in flag_only_groups.items():
    df[group_name] = df[group_cols].gt(0).any(axis=1).astype(int)

# Replace the raw faith and gender values with their encoded form.
# NOTE(review): assuming the *_ENCODING objects are plain dicts, any value
# without an entry becomes NaN — confirm every category is covered.
df = df.assign(
    faith=df['faith'].map(FAITH_ENCODING),
    gender=df['gender'].map(GENDER_ENCODING),
)

# Derive label columns by decoding the encoded values again
df = df.assign(
    gender_label=df['gender'].map(GENDER_DECODING),
    faith_label=df['faith'].map(FAITH_DECODING),
)

# Fill missing values with 0 and cast to integer, but only for the columns
# collected in target_cols: the three linen totals, 'object_phrases',
# 'wealth_points' and every column named in the key's 'col_name' column.
# NOTE(review): derived sums created earlier (total_linens, var_*) are not in
# this list, so they keep whatever dtype .sum() produced — confirm intended.
all_objects = key['col_name'].to_list()
target_cols = [
    'total_tablecloths',
    'total_bedsheets',
    'total_tunics',
    'object_phrases',
    'wealth_points',
] + all_objects
df[target_cols] = (
    df[target_cols]
    .fillna(0)
    .astype(int)
)


In [3]:
# Persist the enriched table as CSV; index=False leaves the row index out of
# the file (the 'id' column already carries the row identifier)
df.to_csv(path_or_buf='data/dataset.csv', index=False)
