In [1]:
import pandas as pd

from utilities import CUSHIONS, FAITH_ENCODING, GENDER_ENCODING, LINENS, PILLOWS


In [2]:
# Load the data
df = pd.read_csv('data/Sample_5.csv')
key = pd.read_csv('data/key.csv')


def _group_columns(group):
    """Return the key's 'col_name' entries whose 'group' equals `group`."""
    return key.loc[key['group'] == group, 'col_name'].tolist()


def _any_positive(columns):
    """Return a 0/1 Series that is 1 where any of `columns` is > 0 in a row.

    NaN never compares greater than zero, so missing values count as absent.
    """
    return (df[columns] > 0).any(axis=1).astype(int)


# add unique id
df['id'] = df.index

# add presence/absence columns for tablecloths, bedsheets, and tunics based on the totals
df['tablecloths'] = df['total_tablecloths'].gt(0).astype(int)
df['bedsheets'] = df['total_bedsheets'].gt(0).astype(int)
df['tunics'] = df['total_tunics'].gt(0).astype(int)
# add presence/absence column for linens combined
df['linens'] = _any_positive(LINENS)
# add total_linens column as sum of tablecloths, bedsheets, and tunics
df['total_linens'] = df[['total_tablecloths', 'total_bedsheets', 'total_tunics']].sum(axis=1)

# Column lists per object group, looked up in the key. The individual names
# are kept at cell level so that later cells can keep referring to them.
dyestuffs_columns = _group_columns('dyestuff')
exotics_columns = _group_columns('exotic')
food_prep_columns = _group_columns('food_preparation')
household_columns = _group_columns('household')
luxury_columns = _group_columns('luxury')
weapons_columns = _group_columns('weapons_and_armour')

# For every group add a presence/absence column (1 if any column of the group
# is > 0) followed by a 'var_*' column holding the row-wise sum of the group.
# The order of this list fixes the column order of the output frame.
group_features = [
    ('dyestuffs', 'var_dyestuffs', dyestuffs_columns),
    ('exotic', 'var_exotic', exotics_columns),
    ('food_prep', 'var_food_prep', food_prep_columns),
    ('household', 'var_household', household_columns),
    ('luxury', 'var_luxury', luxury_columns),
    ('weapons', 'var_weapons', weapons_columns),
]
for presence_col, total_col, group_cols in group_features:
    df[presence_col] = _any_positive(group_cols)
    df[total_col] = df[group_cols].sum(axis=1)

# add presence/absence columns for cushions and pillows based on the respective subgroups in the key
df['cushions'] = _any_positive(CUSHIONS)
df['pillows'] = _any_positive(PILLOWS)

# encode faith and gender as integers
# NOTE(review): assuming the encodings are plain dicts, Series.map turns any
# label that is not a key into NaN; these two columns are not part of the
# NaN fill below -- confirm every label in the data is covered.
df['faith'] = df['faith'].map(FAITH_ENCODING)
df['gender'] = df['gender'].map(GENDER_ENCODING)

# Replace NaN values in all columns with 0 and convert to integers
all_objects = key['col_name'].tolist()
target_cols = [
    'total_tablecloths',
    'total_bedsheets',
    'total_tunics',
    'object_phrases',
    'wealth_points',
    *all_objects,
]
df[target_cols] = df[target_cols].fillna(0).astype(int)

# The derived totals were summed before the cast above, so they are float
# whenever a source column contained NaN. Cast them too, so that they are
# stored as integers like the columns they were computed from.
derived_total_cols = ['total_linens'] + [total_col for _, total_col, _ in group_features]
df[derived_total_cols] = df[derived_total_cols].fillna(0).astype(int)


In [3]:
# Write the prepared frame out as CSV; the row index is not written to the file.
output_path = 'data/dataset.csv'
df.to_csv(output_path, index=False)
